diff --git a/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml b/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml index 1d5e9bcce4c8..33f3010f48c3 100644 --- a/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml +++ b/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml @@ -62,7 +62,7 @@ examples: }; mmc@5b010000 { - compatible = "fsl,imx8qxp-usdhc"; + compatible = "fsl,imx8qxp-usdhc", "fsl,imx7d-usdhc"; interrupts = ; reg = <0x5b010000 0x10000>; clocks = <&conn_lpcg IMX_CONN_LPCG_SDHC0_IPG_CLK>, diff --git a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml index 10b45966f1b8..e71d13c2d109 100644 --- a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml +++ b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml @@ -21,23 +21,26 @@ description: | properties: compatible: - enum: - - fsl,imx25-esdhc - - fsl,imx35-esdhc - - fsl,imx51-esdhc - - fsl,imx53-esdhc - - fsl,imx6q-usdhc - - fsl,imx6sl-usdhc - - fsl,imx6sx-usdhc - - fsl,imx6ull-usdhc - - fsl,imx7d-usdhc - - fsl,imx7ulp-usdhc - - fsl,imx8mq-usdhc - - fsl,imx8mm-usdhc - - fsl,imx8mn-usdhc - - fsl,imx8mp-usdhc - - fsl,imx8qm-usdhc - - fsl,imx8qxp-usdhc + oneOf: + - enum: + - fsl,imx25-esdhc + - fsl,imx35-esdhc + - fsl,imx51-esdhc + - fsl,imx53-esdhc + - fsl,imx6q-usdhc + - fsl,imx6sl-usdhc + - fsl,imx6sx-usdhc + - fsl,imx6ull-usdhc + - fsl,imx7d-usdhc + - fsl,imx7ulp-usdhc + - items: + - enum: + - fsl,imx8mm-usdhc + - fsl,imx8mn-usdhc + - fsl,imx8mp-usdhc + - fsl,imx8mq-usdhc + - fsl,imx8qxp-usdhc + - const: fsl,imx7d-usdhc reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/mmc/microchip,dw-sparx5-sdhci.yaml b/Documentation/devicetree/bindings/mmc/microchip,dw-sparx5-sdhci.yaml new file mode 100644 index 000000000000..55883290543b --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/microchip,dw-sparx5-sdhci.yaml @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mmc/microchip,dw-sparx5-sdhci.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip Sparx5 Mobile Storage Host Controller Binding + +allOf: + - $ref: "mmc-controller.yaml" + +maintainers: + - Lars Povlsen + +# Everything else is described in the common file +properties: + compatible: + const: microchip,dw-sparx5-sdhci + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + description: + Handle to "core" clock for the sdhci controller. + + clock-names: + items: + - const: core + + microchip,clock-delay: + description: Delay clock to card to meet setup time requirements. + Each step increase by 1.25ns. + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 1 + maximum: 15 + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +examples: + - | + #include + #include + sdhci0: mmc@600800000 { + compatible = "microchip,dw-sparx5-sdhci"; + reg = <0x00800000 0x1000>; + pinctrl-0 = <&emmc_pins>; + pinctrl-names = "default"; + clocks = <&clks CLK_ID_AUX1>; + clock-names = "core"; + assigned-clocks = <&clks CLK_ID_AUX1>; + assigned-clock-rates = <800000000>; + interrupts = ; + bus-width = <8>; + microchip,clock-delay = <10>; + }; diff --git a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml index b96da0c7f819..f928f66fc59a 100644 --- a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml +++ b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml @@ -14,6 +14,10 @@ description: | that requires the respective functionality should implement them using these definitions. + It is possible to assign a fixed index mmcN to an MMC host controller + (and the corresponding mmcblkN devices) by defining an alias in the + /aliases device tree node. + properties: $nodename: pattern: "^mmc(@.*)?$" diff --git a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml index 449215444723..8d625f903856 100644 --- a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml +++ b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml @@ -20,6 +20,8 @@ properties: reset-gpios: minItems: 1 + # Put some limit to avoid false warnings + maxItems: 32 description: contains a list of GPIO specifiers. The reset GPIOs are asserted at initialization and prior we start the power up procedure of the card. diff --git a/Documentation/devicetree/bindings/mmc/owl-mmc.yaml b/Documentation/devicetree/bindings/mmc/owl-mmc.yaml index 1380501fb8f0..5eab25ccf7ae 100644 --- a/Documentation/devicetree/bindings/mmc/owl-mmc.yaml +++ b/Documentation/devicetree/bindings/mmc/owl-mmc.yaml @@ -14,7 +14,11 @@ maintainers: properties: compatible: - const: actions,owl-mmc + oneOf: + - const: actions,owl-mmc + - items: + - const: actions,s700-mmc + - const: actions,owl-mmc reg: maxItems: 1 diff --git a/Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml b/Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml index b4c3fd40caeb..6bbf29b5c239 100644 --- a/Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml +++ b/Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml @@ -50,6 +50,7 @@ properties: - renesas,sdhi-r8a774a1 # RZ/G2M - renesas,sdhi-r8a774b1 # RZ/G2N - renesas,sdhi-r8a774c0 # RZ/G2E + - renesas,sdhi-r8a774e1 # RZ/G2H - renesas,sdhi-r8a7795 # R-Car H3 - renesas,sdhi-r8a7796 # R-Car M3-W - renesas,sdhi-r8a77961 # R-Car M3-W+ diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt b/Documentation/devicetree/bindings/mmc/sdhci-am654.txt deleted file mode 100644 index 6d202f4d9249..000000000000 --- a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt +++ /dev/null @@ -1,61 +0,0 @@ -Device Tree Bindings for the SDHCI Controllers present on TI's AM654 SOCs - -The bindings follow the mmc[1], clock[2] and interrupt[3] bindings. -Only deviations are documented here. - - [1] Documentation/devicetree/bindings/mmc/mmc.txt - [2] Documentation/devicetree/bindings/clock/clock-bindings.txt - [3] Documentation/devicetree/bindings/interrupt-controller/interrupts.txt - -Required Properties: - - compatible: should be one of: - "ti,am654-sdhci-5.1": SDHCI on AM654 device. - "ti,j721e-sdhci-8bit": 8 bit SDHCI on J721E device. - "ti,j721e-sdhci-4bit": 4 bit SDHCI on J721E device. - - reg: Must be two entries. - - The first should be the sdhci register space - - The second should the subsystem/phy register space - - clocks: Handles to the clock inputs. - - clock-names: Tuple including "clk_xin" and "clk_ahb" - - interrupts: Interrupt specifiers - Output tap delay for each speed mode: - - ti,otap-del-sel-legacy - - ti,otap-del-sel-mmc-hs - - ti,otap-del-sel-sd-hs - - ti,otap-del-sel-sdr12 - - ti,otap-del-sel-sdr25 - - ti,otap-del-sel-sdr50 - - ti,otap-del-sel-sdr104 - - ti,otap-del-sel-ddr50 - - ti,otap-del-sel-ddr52 - - ti,otap-del-sel-hs200 - - ti,otap-del-sel-hs400 - These bindings must be provided otherwise the driver will disable the - corresponding speed mode (i.e. all nodes must provide at least -legacy) - -Optional Properties (Required for ti,am654-sdhci-5.1 and ti,j721e-sdhci-8bit): - - ti,trm-icp: DLL trim select - - ti,driver-strength-ohm: driver strength in ohms. - Valid values are 33, 40, 50, 66 and 100 ohms. -Optional Properties: - - ti,strobe-sel: strobe select delay for HS400 speed mode. Default value: 0x0. - - ti,clkbuf-sel: Clock Delay Buffer Select - -Example: - - sdhci0: sdhci@4f80000 { - compatible = "ti,am654-sdhci-5.1"; - reg = <0x0 0x4f80000 0x0 0x260>, <0x0 0x4f90000 0x0 0x134>; - power-domains = <&k3_pds 47>; - clocks = <&k3_clks 47 0>, <&k3_clks 47 1>; - clock-names = "clk_ahb", "clk_xin"; - interrupts = ; - sdhci-caps-mask = <0x80000007 0x0>; - mmc-ddr-1_8v; - ti,otap-del-sel-legacy = <0x0>; - ti,otap-del-sel-mmc-hs = <0x0>; - ti,otap-del-sel-ddr52 = <0x5>; - ti,otap-del-sel-hs200 = <0x5>; - ti,otap-del-sel-hs400 = <0x0>; - ti,trm-icp = <0x8>; - }; diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml b/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml new file mode 100644 index 000000000000..ac79f3adf20b --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml @@ -0,0 +1,218 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2020 Texas Instruments Incorporated - http://www.ti.com/ +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/mmc/sdhci-am654.yaml#" +$schema : "http://devicetree.org/meta-schemas/core.yaml#" + +title: TI AM654 MMC Controller + +maintainers: + - Ulf Hansson + +allOf: + - $ref: mmc-controller.yaml# + +properties: + compatible: + enum: + - ti,am654-sdhci-5.1 + - ti,j721e-sdhci-8bit + - ti,j721e-sdhci-4bit + - ti,j7200-sdhci-8bit + - ti,j721e-sdhci-4bit + + reg: + maxItems: 2 + + interrupts: + maxItems: 1 + + power-domains: + maxItems: 1 + + clocks: + minItems: 1 + maxItems: 2 + description: Handles to input clocks + + clock-names: + minItems: 1 + maxItems: 2 + items: + - const: clk_ahb + - const: clk_xin + + # PHY output tap delays: + # Used to delay the data valid window and align it to the sampling clock. + # Binding needs to be provided for each supported speed mode otherwise the + # corresponding mode will be disabled. + + ti,otap-del-sel-legacy: + description: Output tap delay for SD/MMC legacy timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-mmc-hs: + description: Output tap delay for MMC high speed timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sd-hs: + description: Output tap delay for SD high speed timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sdr12: + description: Output tap delay for SD UHS SDR12 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sdr25: + description: Output tap delay for SD UHS SDR25 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sdr50: + description: Output tap delay for SD UHS SDR50 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sdr104: + description: Output tap delay for SD UHS SDR104 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-ddr50: + description: Output tap delay for SD UHS DDR50 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-ddr52: + description: Output tap delay for eMMC DDR52 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-hs200: + description: Output tap delay for eMMC HS200 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-hs400: + description: Output tap delay for eMMC HS400 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + # PHY input tap delays: + # Used to delay the data valid window and align it to the sampling clock for + # modes that don't support tuning + + ti,itap-del-sel-legacy: + description: Input tap delay for SD/MMC legacy timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-mmc-hs: + description: Input tap delay for MMC high speed timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-sd-hs: + description: Input tap delay for SD high speed timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-sdr12: + description: Input tap delay for SD UHS SDR12 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-sdr25: + description: Input tap delay for SD UHS SDR25 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-ddr52: + description: Input tap delay for MMC DDR52 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,trm-icp: + description: DLL trim select + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,driver-strength-ohm: + description: DLL drive strength in ohms + $ref: "/schemas/types.yaml#/definitions/uint32" + oneOf: + - enum: + - 33 + - 40 + - 50 + - 66 + - 100 + + ti,strobe-sel: + description: strobe select delay for HS400 speed mode. + $ref: "/schemas/types.yaml#/definitions/uint32" + + ti,clkbuf-sel: + description: Clock Delay Buffer Select + $ref: "/schemas/types.yaml#/definitions/uint32" + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + - ti,otap-del-sel-legacy + +examples: + - | + #include + #include + + bus { + #address-cells = <2>; + #size-cells = <2>; + + mmc0: mmc@4f80000 { + compatible = "ti,am654-sdhci-5.1"; + reg = <0x0 0x4f80000 0x0 0x260>, <0x0 0x4f90000 0x0 0x134>; + power-domains = <&k3_pds 47>; + clocks = <&k3_clks 47 0>, <&k3_clks 47 1>; + clock-names = "clk_ahb", "clk_xin"; + interrupts = ; + sdhci-caps-mask = <0x80000007 0x0>; + mmc-ddr-1_8v; + ti,otap-del-sel-legacy = <0x0>; + ti,otap-del-sel-mmc-hs = <0x0>; + ti,otap-del-sel-ddr52 = <0x5>; + ti,otap-del-sel-hs200 = <0x5>; + ti,otap-del-sel-hs400 = <0x0>; + ti,itap-del-sel-legacy = <0x10>; + ti,itap-del-sel-mmc-hs = <0xa>; + ti,itap-del-sel-ddr52 = <0x3>; + ti,trm-icp = <0x8>; + }; + }; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 178954228631..8004dd64d09a 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1223,7 +1223,7 @@ static int msb_read_boot_blocks(struct msb_data *msb) } if (be16_to_cpu(page->header.block_id) != MS_BLOCK_BOOT_ID) { - dbg("the pba at %d doesn' contain boot block ID", pba); + dbg("the pba at %d doesn't contain boot block ID", pba); continue; } diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 70207f11a654..c2e70b757dd1 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -68,6 +68,7 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) { struct mmc_card *card = mmc_dev_to_card(dev); const char *type; + unsigned int i; int retval = 0; switch (card->type) { @@ -98,6 +99,17 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) card->cis.vendor, card->cis.device); if (retval) return retval; + + retval = add_uevent_var(env, "SDIO_REVISION=%u.%u", + card->major_rev, card->minor_rev); + if (retval) + return retval; + + for (i = 0; i < card->num_info; i++) { + retval = add_uevent_var(env, "SDIO_INFO%u=%s", i+1, card->info[i]); + if (retval) + return retval; + } } /* diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 8ccae6452b9c..d42037f0f10d 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2063,6 +2063,16 @@ static void mmc_hw_reset_for_init(struct mmc_host *host) host->ops->hw_reset(host); } +/** + * mmc_hw_reset - reset the card in hardware + * @host: MMC host to which the card is attached + * + * Hard reset the card. This function is only for upper layers, like the + * block layer or card drivers. You cannot use it in host drivers (struct + * mmc_card might be gone then). + * + * Return: 0 on success, -errno on failure + */ int mmc_hw_reset(struct mmc_host *host) { int ret; diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index d576eeca7e14..3e11217857bc 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -376,6 +376,20 @@ int mmc_of_parse_voltage(struct device_node *np, u32 *mask) } EXPORT_SYMBOL(mmc_of_parse_voltage); +/** + * mmc_first_nonreserved_index() - get the first index that is not reserved + */ +static int mmc_first_nonreserved_index(void) +{ + int max; + + max = of_alias_get_highest_id("mmc"); + if (max < 0) + return 0; + + return max + 1; +} + /** * mmc_alloc_host - initialise the per-host structure. * @extra: sizeof private data structure @@ -387,6 +401,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) { int err; struct mmc_host *host; + int alias_id, min_idx, max_idx; host = kzalloc(sizeof(struct mmc_host) + extra, GFP_KERNEL); if (!host) @@ -395,7 +410,16 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) /* scanning will be enabled when we're ready */ host->rescan_disable = 1; - err = ida_simple_get(&mmc_host_ida, 0, 0, GFP_KERNEL); + alias_id = of_alias_get_id(dev->of_node, "mmc"); + if (alias_id >= 0) { + min_idx = alias_id; + max_idx = alias_id + 1; + } else { + min_idx = mmc_first_nonreserved_index(); + max_idx = 0; + } + + err = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL); if (err < 0) { kfree(host); return NULL; diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index b3fa193de846..ff3063ce2acd 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1168,13 +1168,13 @@ static int mmc_select_hs400(struct mmc_card *card) return err; } - /* Set host controller to HS timing */ - mmc_set_timing(card->host, MMC_TIMING_MMC_HS); - /* Prepare host to downgrade to HS timing */ if (host->ops->hs400_downgrade) host->ops->hs400_downgrade(host); + /* Set host controller to HS timing */ + mmc_set_timing(host, MMC_TIMING_MMC_HS); + /* Reduce frequency to HS frequency */ max_dtr = card->ext_csd.hs_max_dtr; mmc_set_clock(host, max_dtr); @@ -1253,6 +1253,9 @@ int mmc_hs400_to_hs200(struct mmc_card *card) if (err) goto out_err; + if (host->ops->hs400_downgrade) + host->ops->hs400_downgrade(host); + mmc_set_timing(host, MMC_TIMING_MMC_DDR52); err = mmc_switch_status(card, true); @@ -1268,9 +1271,6 @@ int mmc_hs400_to_hs200(struct mmc_card *card) mmc_set_timing(host, MMC_TIMING_MMC_HS); - if (host->ops->hs400_downgrade) - host->ops->hs400_downgrade(host); - err = mmc_switch_status(card, true); if (err) goto out_err; @@ -1763,13 +1763,17 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, goto free_card; if (mmc_card_hs200(card)) { + host->doing_init_tune = 1; + err = mmc_hs200_tuning(card); + if (!err) + err = mmc_select_hs400(card); + + host->doing_init_tune = 0; + if (err) goto free_card; - err = mmc_select_hs400(card); - if (err) - goto free_card; } else if (!mmc_card_hs400es(card)) { /* Select the desired bus width optionally */ err = mmc_select_bus_width(card); diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c index c21b3cb71775..152e7525ed33 100644 --- a/drivers/mmc/core/mmc_test.c +++ b/drivers/mmc/core/mmc_test.c @@ -2669,22 +2669,22 @@ static const struct mmc_test_case mmc_test_cases[] = { }, { - .name = "Correct xfer_size at write (start failure)", + .name = "Proper xfer_size at write (start failure)", .run = mmc_test_xfersize_write, }, { - .name = "Correct xfer_size at read (start failure)", + .name = "Proper xfer_size at read (start failure)", .run = mmc_test_xfersize_read, }, { - .name = "Correct xfer_size at write (midway failure)", + .name = "Proper xfer_size at write (midway failure)", .run = mmc_test_multi_xfersize_write, }, { - .name = "Correct xfer_size at read (midway failure)", + .name = "Proper xfer_size at read (midway failure)", .run = mmc_test_multi_xfersize_read, }, diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 5a2210c25aa7..6f054c449d46 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -709,10 +709,34 @@ static DEVICE_ATTR(dsr, S_IRUGO, mmc_dsr_show, NULL); MMC_DEV_ATTR(vendor, "0x%04x\n", card->cis.vendor); MMC_DEV_ATTR(device, "0x%04x\n", card->cis.device); +MMC_DEV_ATTR(revision, "%u.%u\n", card->major_rev, card->minor_rev); + +#define sdio_info_attr(num) \ +static ssize_t info##num##_show(struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + struct mmc_card *card = mmc_dev_to_card(dev); \ + \ + if (num > card->num_info) \ + return -ENODATA; \ + if (!card->info[num-1][0]) \ + return 0; \ + return sprintf(buf, "%s\n", card->info[num-1]); \ +} \ +static DEVICE_ATTR_RO(info##num) + +sdio_info_attr(1); +sdio_info_attr(2); +sdio_info_attr(3); +sdio_info_attr(4); static struct attribute *sd_std_attrs[] = { &dev_attr_vendor.attr, &dev_attr_device.attr, + &dev_attr_revision.attr, + &dev_attr_info1.attr, + &dev_attr_info2.attr, + &dev_attr_info3.attr, + &dev_attr_info4.attr, &dev_attr_cid.attr, &dev_attr_csd.attr, &dev_attr_scr.attr, @@ -735,12 +759,18 @@ static struct attribute *sd_std_attrs[] = { static umode_t sd_std_is_visible(struct kobject *kobj, struct attribute *attr, int index) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct mmc_card *card = mmc_dev_to_card(dev); - /* CIS vendor and device ids are available only for Combo cards */ - if ((attr == &dev_attr_vendor.attr || attr == &dev_attr_device.attr) && - card->type != MMC_TYPE_SD_COMBO) + /* CIS vendor and device ids, revision and info string are available only for Combo cards */ + if ((attr == &dev_attr_vendor.attr || + attr == &dev_attr_device.attr || + attr == &dev_attr_revision.attr || + attr == &dev_attr_info1.attr || + attr == &dev_attr_info2.attr || + attr == &dev_attr_info3.attr || + attr == &dev_attr_info4.attr + ) && card->type != MMC_TYPE_SD_COMBO) return 0; return attr->mode; diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index 7b40553d3934..694a212cbe25 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -29,12 +29,36 @@ MMC_DEV_ATTR(vendor, "0x%04x\n", card->cis.vendor); MMC_DEV_ATTR(device, "0x%04x\n", card->cis.device); +MMC_DEV_ATTR(revision, "%u.%u\n", card->major_rev, card->minor_rev); MMC_DEV_ATTR(ocr, "0x%08x\n", card->ocr); MMC_DEV_ATTR(rca, "0x%04x\n", card->rca); +#define sdio_info_attr(num) \ +static ssize_t info##num##_show(struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + struct mmc_card *card = mmc_dev_to_card(dev); \ + \ + if (num > card->num_info) \ + return -ENODATA; \ + if (!card->info[num-1][0]) \ + return 0; \ + return sprintf(buf, "%s\n", card->info[num-1]); \ +} \ +static DEVICE_ATTR_RO(info##num) + +sdio_info_attr(1); +sdio_info_attr(2); +sdio_info_attr(3); +sdio_info_attr(4); + static struct attribute *sdio_std_attrs[] = { &dev_attr_vendor.attr, &dev_attr_device.attr, + &dev_attr_revision.attr, + &dev_attr_info1.attr, + &dev_attr_info2.attr, + &dev_attr_info3.attr, + &dev_attr_info4.attr, &dev_attr_ocr.attr, &dev_attr_rca.attr, NULL, diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index 3cc928282af7..3d709029e07c 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -28,34 +28,50 @@ #define to_sdio_driver(d) container_of(d, struct sdio_driver, drv) /* show configuration fields */ -#define sdio_config_attr(field, format_string) \ +#define sdio_config_attr(field, format_string, args...) \ static ssize_t \ field##_show(struct device *dev, struct device_attribute *attr, char *buf) \ { \ struct sdio_func *func; \ \ func = dev_to_sdio_func (dev); \ - return sprintf (buf, format_string, func->field); \ + return sprintf(buf, format_string, args); \ } \ static DEVICE_ATTR_RO(field) -sdio_config_attr(class, "0x%02x\n"); -sdio_config_attr(vendor, "0x%04x\n"); -sdio_config_attr(device, "0x%04x\n"); +sdio_config_attr(class, "0x%02x\n", func->class); +sdio_config_attr(vendor, "0x%04x\n", func->vendor); +sdio_config_attr(device, "0x%04x\n", func->device); +sdio_config_attr(revision, "%u.%u\n", func->major_rev, func->minor_rev); +sdio_config_attr(modalias, "sdio:c%02Xv%04Xd%04X\n", func->class, func->vendor, func->device); -static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct sdio_func *func = dev_to_sdio_func (dev); +#define sdio_info_attr(num) \ +static ssize_t info##num##_show(struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + struct sdio_func *func = dev_to_sdio_func(dev); \ + \ + if (num > func->num_info) \ + return -ENODATA; \ + if (!func->info[num-1][0]) \ + return 0; \ + return sprintf(buf, "%s\n", func->info[num-1]); \ +} \ +static DEVICE_ATTR_RO(info##num) - return sprintf(buf, "sdio:c%02Xv%04Xd%04X\n", - func->class, func->vendor, func->device); -} -static DEVICE_ATTR_RO(modalias); +sdio_info_attr(1); +sdio_info_attr(2); +sdio_info_attr(3); +sdio_info_attr(4); static struct attribute *sdio_dev_attrs[] = { &dev_attr_class.attr, &dev_attr_vendor.attr, &dev_attr_device.attr, + &dev_attr_revision.attr, + &dev_attr_info1.attr, + &dev_attr_info2.attr, + &dev_attr_info3.attr, + &dev_attr_info4.attr, &dev_attr_modalias.attr, NULL, }; @@ -106,6 +122,7 @@ static int sdio_bus_uevent(struct device *dev, struct kobj_uevent_env *env) { struct sdio_func *func = dev_to_sdio_func(dev); + unsigned int i; if (add_uevent_var(env, "SDIO_CLASS=%02X", func->class)) @@ -115,6 +132,15 @@ sdio_bus_uevent(struct device *dev, struct kobj_uevent_env *env) "SDIO_ID=%04X:%04X", func->vendor, func->device)) return -ENOMEM; + if (add_uevent_var(env, + "SDIO_REVISION=%u.%u", func->major_rev, func->minor_rev)) + return -ENOMEM; + + for (i = 0; i < func->num_info; i++) { + if (add_uevent_var(env, "SDIO_INFO%u=%s", i+1, func->info[i])) + return -ENOMEM; + } + if (add_uevent_var(env, "MODALIAS=sdio:c%02Xv%04Xd%04X", func->class, func->vendor, func->device)) diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index e0655278c5c3..44bea5e4aeda 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -23,9 +23,16 @@ static int cistpl_vers_1(struct mmc_card *card, struct sdio_func *func, const unsigned char *buf, unsigned size) { + u8 major_rev, minor_rev; unsigned i, nr_strings; char **buffer, *string; + if (size < 2) + return 0; + + major_rev = buf[0]; + minor_rev = buf[1]; + /* Find all null-terminated (including zero length) strings in the TPLLV1_INFO field. Trailing garbage is ignored. */ buf += 2; @@ -57,9 +64,13 @@ static int cistpl_vers_1(struct mmc_card *card, struct sdio_func *func, } if (func) { + func->major_rev = major_rev; + func->minor_rev = minor_rev; func->num_info = nr_strings; func->info = (const char**)buffer; } else { + card->major_rev = major_rev; + card->minor_rev = minor_rev; card->num_info = nr_strings; card->info = (const char**)buffer; } diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 9a34c827c96e..f0cb7aeabbc4 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -178,7 +178,7 @@ config MMC_SDHCI_OF_AT91 config MMC_SDHCI_OF_ESDHC tristate "SDHCI OF support for the Freescale eSDHC controller" depends on MMC_SDHCI_PLTFM - depends on PPC || ARCH_MXC || ARCH_LAYERSCAPE + depends on PPC || ARCH_MXC || ARCH_LAYERSCAPE || COMPILE_TEST select MMC_SDHCI_IO_ACCESSORS select FSL_GUTS help @@ -213,6 +213,18 @@ config MMC_SDHCI_OF_DWCMSHC If you have a controller with this interface, say Y or M here. If unsure, say N. +config MMC_SDHCI_OF_SPARX5 + tristate "SDHCI OF support for the MCHP Sparx5 SoC" + depends on MMC_SDHCI_PLTFM + depends on ARCH_SPARX5 || COMPILE_TEST + help + This selects the Secure Digital Host Controller Interface (SDHCI) + found in the MCHP Sparx5 SoC. + + If you have a Sparx5 SoC with this interface, say Y or M here. + + If unsure, say N. + config MMC_SDHCI_CADENCE tristate "SDHCI support for the Cadence SD/SDIO/eMMC controller" depends on MMC_SDHCI_PLTFM @@ -226,7 +238,7 @@ config MMC_SDHCI_CADENCE config MMC_SDHCI_CNS3XXX tristate "SDHCI support on the Cavium Networks CNS3xxx SoC" - depends on ARCH_CNS3XXX + depends on ARCH_CNS3XXX || COMPILE_TEST depends on MMC_SDHCI_PLTFM help This selects the SDHCI support for CNS3xxx System-on-Chip devices. @@ -250,7 +262,7 @@ config MMC_SDHCI_ESDHC_MCF config MMC_SDHCI_ESDHC_IMX tristate "SDHCI support for the Freescale eSDHC/uSDHC i.MX controller" - depends on ARCH_MXC + depends on ARCH_MXC || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_CQHCI @@ -264,7 +276,7 @@ config MMC_SDHCI_ESDHC_IMX config MMC_SDHCI_DOVE tristate "SDHCI support on Marvell's Dove SoC" - depends on ARCH_DOVE || MACH_DOVE + depends on ARCH_DOVE || MACH_DOVE || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS help @@ -277,7 +289,7 @@ config MMC_SDHCI_DOVE config MMC_SDHCI_TEGRA tristate "SDHCI platform support for the Tegra SD/MMC Controller" - depends on ARCH_TEGRA + depends on ARCH_TEGRA || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_CQHCI @@ -289,7 +301,8 @@ config MMC_SDHCI_TEGRA config MMC_SDHCI_S3C tristate "SDHCI support on Samsung S3C SoC" - depends on MMC_SDHCI && PLAT_SAMSUNG + depends on MMC_SDHCI + depends on PLAT_SAMSUNG || COMPILE_TEST help This selects the Secure Digital Host Controller Interface (SDHCI) often referrered to as the HSMMC block in some of the Samsung S3C @@ -301,7 +314,7 @@ config MMC_SDHCI_S3C config MMC_SDHCI_SIRF tristate "SDHCI support on CSR SiRFprimaII and SiRFmarco SoCs" - depends on ARCH_SIRF + depends on ARCH_SIRF || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS help @@ -339,7 +352,8 @@ config MMC_SDHCI_PXAV2 config MMC_SDHCI_SPEAR tristate "SDHCI support on ST SPEAr platform" - depends on MMC_SDHCI && PLAT_SPEAR + depends on MMC_SDHCI + depends on PLAT_SPEAR || COMPILE_TEST depends on OF help This selects the Secure Digital Host Controller Interface (SDHCI) @@ -362,7 +376,7 @@ config MMC_SDHCI_S3C_DMA config MMC_SDHCI_BCM_KONA tristate "SDHCI support on Broadcom KONA platform" - depends on ARCH_BCM_MOBILE + depends on ARCH_BCM_MOBILE || COMPILE_TEST depends on MMC_SDHCI_PLTFM help This selects the Broadcom Kona Secure Digital Host Controller @@ -410,7 +424,8 @@ config MMC_SDHCI_IPROC config MMC_MESON_GX tristate "Amlogic S905/GX*/AXG SD/MMC Host Controller support" - depends on ARCH_MESON && MMC + depends on ARCH_MESON|| COMPILE_TEST + depends on COMMON_CLK help This selects support for the Amlogic SD/MMC Host Controller found on the S905/GX*/AXG family of SoCs. This controller is @@ -446,7 +461,7 @@ config MMC_MESON_MX_SDIO config MMC_MOXART tristate "MOXART SD/MMC Host Controller support" - depends on ARCH_MOXART && MMC + depends on ARCH_MOXART || COMPILE_TEST help This selects support for the MOXART SD/MMC Host Controller. MOXA provides one multi-functional card reader which can @@ -455,7 +470,7 @@ config MMC_MOXART config MMC_SDHCI_ST tristate "SDHCI support on STMicroelectronics SoC" - depends on ARCH_STI || FSP2 + depends on ARCH_STI || FSP2 || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS help @@ -525,7 +540,7 @@ config MMC_ATMELMCI config MMC_SDHCI_MSM tristate "Qualcomm SDHCI Controller Support" - depends on ARCH_QCOM || (ARM && COMPILE_TEST) + depends on ARCH_QCOM || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_CQHCI @@ -575,7 +590,7 @@ config MMC_TIFM_SD config MMC_MVSDIO tristate "Marvell MMC/SD/SDIO host driver" - depends on PLAT_ORION + depends on PLAT_ORION || (COMPILE_TEST && ARM) depends on OF help This selects the Marvell SDIO host driver. @@ -587,7 +602,7 @@ config MMC_MVSDIO config MMC_DAVINCI tristate "TI DAVINCI Multimedia Card Interface support" - depends on ARCH_DAVINCI + depends on ARCH_DAVINCI || COMPILE_TEST help This selects the TI DAVINCI Multimedia card Interface. If you have an DAVINCI board with a Multimedia Card slot, @@ -669,7 +684,7 @@ config MMC_SDRICOH_CS config MMC_SDHCI_SPRD tristate "Spreadtrum SDIO host Controller" - depends on ARCH_SPRD + depends on ARCH_SPRD || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_HSQ @@ -686,7 +701,7 @@ config MMC_TMIO_CORE config MMC_TMIO tristate "Toshiba Mobile IO Controller (TMIO) MMC/SD function support" - depends on MFD_TMIO || MFD_ASIC3 + depends on MFD_TMIO || MFD_ASIC3 || COMPILE_TEST select MMC_TMIO_CORE help This provides support for the SD/MMC cell found in TC6393XB, @@ -777,7 +792,7 @@ config MMC_CAVIUM_THUNDERX config MMC_DW tristate "Synopsys DesignWare Memory Card Interface" - depends on ARC || ARM || ARM64 || MIPS || COMPILE_TEST + depends on ARC || ARM || ARM64 || MIPS || RISCV || CSKY || COMPILE_TEST help This selects support for the Synopsys DesignWare Mobile Storage IP block, this provides host support for SD and MMC interfaces, in both @@ -959,7 +974,7 @@ config MMC_REALTEK_USB config MMC_SUNXI tristate "Allwinner sunxi SD/MMC Host Controller support" - depends on ARCH_SUNXI + depends on ARCH_SUNXI || COMPILE_TEST help This selects support for the SD/MMC Host Controller on Allwinner sunxi SoCs. diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 4d5bcb0144a0..451c25fc2c69 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_MMC_SDHCI_OF_AT91) += sdhci-of-at91.o obj-$(CONFIG_MMC_SDHCI_OF_ESDHC) += sdhci-of-esdhc.o obj-$(CONFIG_MMC_SDHCI_OF_HLWD) += sdhci-of-hlwd.o obj-$(CONFIG_MMC_SDHCI_OF_DWCMSHC) += sdhci-of-dwcmshc.o +obj-$(CONFIG_MMC_SDHCI_OF_SPARX5) += sdhci-of-sparx5.o obj-$(CONFIG_MMC_SDHCI_BCM_KONA) += sdhci-bcm-kona.o obj-$(CONFIG_MMC_SDHCI_IPROC) += sdhci-iproc.o obj-$(CONFIG_MMC_SDHCI_MSM) += sdhci-msm.o diff --git a/drivers/mmc/host/alcor.c b/drivers/mmc/host/alcor.c index 026ca9194ce5..bfb8efeb7eb8 100644 --- a/drivers/mmc/host/alcor.c +++ b/drivers/mmc/host/alcor.c @@ -1178,6 +1178,7 @@ static struct platform_driver alcor_pci_sdmmc_driver = { .id_table = alcor_pci_sdmmc_ids, .driver = { .name = DRV_NAME_ALCOR_PCI_SDMMC, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &alcor_mmc_pm_ops }, }; diff --git a/drivers/mmc/host/android-goldfish.c b/drivers/mmc/host/android-goldfish.c index ceb4924e02d0..e878fdf8f20a 100644 --- a/drivers/mmc/host/android-goldfish.c +++ b/drivers/mmc/host/android-goldfish.c @@ -537,6 +537,7 @@ static struct platform_driver goldfish_mmc_driver = { .remove = goldfish_mmc_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 3fc3bbea8536..444bd3a0a922 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -2668,6 +2668,7 @@ static struct platform_driver atmci_driver = { .remove = atmci_remove, .driver = { .name = "atmel_mci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(atmci_dt_ids), .pm = &atmci_dev_pm_ops, }, diff --git a/drivers/mmc/host/au1xmmc.c b/drivers/mmc/host/au1xmmc.c index 9bb1910268ca..bd00515fbaba 100644 --- a/drivers/mmc/host/au1xmmc.c +++ b/drivers/mmc/host/au1xmmc.c @@ -1189,6 +1189,7 @@ static struct platform_driver au1xmmc_driver = { .resume = au1xmmc_resume, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c index a0767790a826..8c2361e66277 100644 --- a/drivers/mmc/host/bcm2835.c +++ b/drivers/mmc/host/bcm2835.c @@ -1406,9 +1406,7 @@ static int bcm2835_probe(struct platform_device *pdev) clk = devm_clk_get(dev, NULL); if (IS_ERR(clk)) { - ret = PTR_ERR(clk); - if (ret != -EPROBE_DEFER) - dev_err(dev, "could not get clk: %d\n", ret); + ret = dev_err_probe(dev, PTR_ERR(clk), "could not get clk\n"); goto err; } @@ -1476,6 +1474,7 @@ static struct platform_driver bcm2835_driver = { .remove = bcm2835_remove, .driver = { .name = "sdhost-bcm2835", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = bcm2835_match, }, }; diff --git a/drivers/mmc/host/cavium-octeon.c b/drivers/mmc/host/cavium-octeon.c index e299cdd1e619..2c4b2df52adb 100644 --- a/drivers/mmc/host/cavium-octeon.c +++ b/drivers/mmc/host/cavium-octeon.c @@ -327,6 +327,7 @@ static struct platform_driver octeon_mmc_driver = { .remove = octeon_mmc_remove, .driver = { .name = KBUILD_MODNAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = octeon_mmc_match, }, }; diff --git a/drivers/mmc/host/cqhci.c b/drivers/mmc/host/cqhci.c index cfa87dfa73d8..697fe40756bf 100644 --- a/drivers/mmc/host/cqhci.c +++ b/drivers/mmc/host/cqhci.c @@ -376,6 +376,9 @@ static void cqhci_off(struct mmc_host *mmc) else pr_debug("%s: cqhci: CQE off\n", mmc_hostname(mmc)); + if (cq_host->ops->post_disable) + cq_host->ops->post_disable(mmc); + mmc->cqe_on = false; } @@ -580,6 +583,9 @@ static int cqhci_request(struct mmc_host *mmc, struct mmc_request *mrq) __cqhci_enable(cq_host); if (!mmc->cqe_on) { + if (cq_host->ops->pre_enable) + cq_host->ops->pre_enable(mmc); + cqhci_writel(cq_host, 0, CQHCI_CTL); mmc->cqe_on = true; pr_debug("%s: cqhci: CQE on\n", mmc_hostname(mmc)); diff --git a/drivers/mmc/host/cqhci.h b/drivers/mmc/host/cqhci.h index 437700179de4..89bf6adbce8c 100644 --- a/drivers/mmc/host/cqhci.h +++ b/drivers/mmc/host/cqhci.h @@ -206,6 +206,8 @@ struct cqhci_host_ops { void (*disable)(struct mmc_host *mmc, bool recovery); void (*update_dcmd_desc)(struct mmc_host *mmc, struct mmc_request *mrq, u64 *data); + void (*pre_enable)(struct mmc_host *mmc); + void (*post_disable)(struct mmc_host *mmc); }; static inline void cqhci_writel(struct cqhci_host *host, u32 val, int reg) diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index e50a08bce7ef..90cd179625fc 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -996,7 +996,7 @@ static irqreturn_t mmc_davinci_irq(int irq, void *dev_id) if (qstatus & MMCST0_RSPDNE) { /* End of command phase */ - end_command = (int) host->cmd; + end_command = host->cmd ? 1 : 0; } if (end_command) @@ -1240,9 +1240,8 @@ static int davinci_mmcsd_probe(struct platform_device *pdev) pdev->id_entry = match->data; ret = mmc_of_parse(mmc); if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, - "could not parse of data: %d\n", ret); + dev_err_probe(&pdev->dev, ret, + "could not parse of data\n"); goto parse_fail; } } else { @@ -1396,6 +1395,7 @@ static const struct dev_pm_ops davinci_mmcsd_pm = { static struct platform_driver davinci_mmcsd_driver = { .driver = { .name = "davinci_mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = davinci_mmcsd_pm_ops, .of_match_table = davinci_mmc_dt_ids, }, diff --git a/drivers/mmc/host/dw_mmc-bluefield.c b/drivers/mmc/host/dw_mmc-bluefield.c index aa38b1a8017e..10baf122bc15 100644 --- a/drivers/mmc/host/dw_mmc-bluefield.c +++ b/drivers/mmc/host/dw_mmc-bluefield.c @@ -55,6 +55,7 @@ static struct platform_driver dw_mci_bluefield_pltfm_driver = { .remove = dw_mci_pltfm_remove, .driver = { .name = "dwmmc_bluefield", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_bluefield_match, .pm = &dw_mci_pltfm_pmops, }, diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index 95adeee07217..0c75810812a0 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -592,6 +592,7 @@ static struct platform_driver dw_mci_exynos_pltfm_driver = { .remove = dw_mci_exynos_remove, .driver = { .name = "dwmmc_exynos", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_exynos_match, .pm = &dw_mci_exynos_pmops, }, diff --git a/drivers/mmc/host/dw_mmc-hi3798cv200.c b/drivers/mmc/host/dw_mmc-hi3798cv200.c index 83e1bad0a008..39794f93826f 100644 --- a/drivers/mmc/host/dw_mmc-hi3798cv200.c +++ b/drivers/mmc/host/dw_mmc-hi3798cv200.c @@ -200,6 +200,7 @@ static struct platform_driver dw_mci_hi3798cv200_driver = { .remove = dw_mci_hi3798cv200_remove, .driver = { .name = "dwmmc_hi3798cv200", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_hi3798cv200_match, }, }; diff --git a/drivers/mmc/host/dw_mmc-k3.c b/drivers/mmc/host/dw_mmc-k3.c index db1a84b2ba61..29d2494eb27a 100644 --- a/drivers/mmc/host/dw_mmc-k3.c +++ b/drivers/mmc/host/dw_mmc-k3.c @@ -473,6 +473,7 @@ static struct platform_driver dw_mci_k3_pltfm_driver = { .remove = dw_mci_pltfm_remove, .driver = { .name = "dwmmc_k3", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_k3_match, .pm = &dw_mci_k3_dev_pm_ops, }, diff --git a/drivers/mmc/host/dw_mmc-pltfm.c b/drivers/mmc/host/dw_mmc-pltfm.c index 7de37f524a96..73731cd3ba23 100644 --- a/drivers/mmc/host/dw_mmc-pltfm.c +++ b/drivers/mmc/host/dw_mmc-pltfm.c @@ -98,6 +98,7 @@ static struct platform_driver dw_mci_pltfm_driver = { .remove = dw_mci_pltfm_remove, .driver = { .name = "dw_mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_pltfm_match, .pm = &dw_mci_pltfm_pmops, }, diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c index d4d02134848c..753502ce3c85 100644 --- a/drivers/mmc/host/dw_mmc-rockchip.c +++ b/drivers/mmc/host/dw_mmc-rockchip.c @@ -383,6 +383,7 @@ static struct platform_driver dw_mci_rockchip_pltfm_driver = { .remove = dw_mci_rockchip_remove, .driver = { .name = "dwmmc_rockchip", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_rockchip_match, .pm = &dw_mci_rockchip_dev_pm_ops, }, diff --git a/drivers/mmc/host/dw_mmc-zx.c b/drivers/mmc/host/dw_mmc-zx.c index eada648b27ec..51bcc6332f3a 100644 --- a/drivers/mmc/host/dw_mmc-zx.c +++ b/drivers/mmc/host/dw_mmc-zx.c @@ -155,7 +155,6 @@ static int dw_mci_zx_parse_dt(struct dw_mci *host) struct device_node *node; struct dw_mci_zx_priv_data *priv; struct regmap *sysc_base; - int ret; /* syscon is needed only by emmc */ node = of_parse_phandle(np, "zte,aon-syscon", 0); @@ -163,13 +162,9 @@ static int dw_mci_zx_parse_dt(struct dw_mci *host) sysc_base = syscon_node_to_regmap(node); of_node_put(node); - if (IS_ERR(sysc_base)) { - ret = PTR_ERR(sysc_base); - if (ret != -EPROBE_DEFER) - dev_err(host->dev, "Can't get syscon: %d\n", - ret); - return ret; - } + if (IS_ERR(sysc_base)) + return dev_err_probe(host->dev, PTR_ERR(sysc_base), + "Can't get syscon\n"); } else { return 0; } @@ -227,6 +222,7 @@ static struct platform_driver dw_mci_zx_pltfm_driver = { .remove = dw_mci_pltfm_remove, .driver = { .name = "dwmmc_zx", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_zx_match, .pm = &dw_mci_zx_dev_pm_ops, }, diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 0fba940544ca..43c5795691fb 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -3161,12 +3161,9 @@ int dw_mci_probe(struct dw_mci *host) if (!host->pdata) { host->pdata = dw_mci_parse_dt(host); - if (PTR_ERR(host->pdata) == -EPROBE_DEFER) { - return -EPROBE_DEFER; - } else if (IS_ERR(host->pdata)) { - dev_err(host->dev, "platform data not available\n"); - return -EINVAL; - } + if (IS_ERR(host->pdata)) + return dev_err_probe(host->dev, PTR_ERR(host->pdata), + "platform data not available\n"); } host->biu_clk = devm_clk_get(host->dev, "biu"); diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index 81d71010b474..a1f92fed2a55 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -991,9 +991,7 @@ static int jz4740_mmc_probe(struct platform_device* pdev) ret = mmc_of_parse(mmc); if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, - "could not parse device properties: %d\n", ret); + dev_err_probe(&pdev->dev, ret, "could not parse device properties\n"); goto err_free_host; } @@ -1126,6 +1124,7 @@ static struct platform_driver jz4740_mmc_driver = { .remove = jz4740_mmc_remove, .driver = { .name = "jz4740-mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(jz4740_mmc_of_match), .pm = pm_ptr(&jz4740_mmc_pm_ops), }, diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 08a3b1c05acb..4ec41579940a 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -426,11 +426,9 @@ static int meson_mmc_clk_init(struct meson_host *host) snprintf(name, sizeof(name), "clkin%d", i); clk = devm_clk_get(host->dev, name); - if (IS_ERR(clk)) { - if (clk != ERR_PTR(-EPROBE_DEFER)) - dev_err(host->dev, "Missing clock %s\n", name); - return PTR_ERR(clk); - } + if (IS_ERR(clk)) + return dev_err_probe(host->dev, PTR_ERR(clk), + "Missing clock %s\n", name); mux_parent_names[i] = __clk_get_name(clk); } @@ -521,7 +519,7 @@ static int meson_mmc_resampling_tuning(struct mmc_host *mmc, u32 opcode) val |= ADJUST_ADJ_EN; writel(val, host->regs + host->data->adjust); - if (mmc->doing_retune) + if (mmc_doing_retune(mmc)) dly = FIELD_GET(ADJUST_ADJ_DELAY_MASK, val) + 1; else dly = 0; @@ -1077,12 +1075,8 @@ static int meson_mmc_probe(struct platform_device *pdev) } ret = device_reset_optional(&pdev->dev); - if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, "device reset failed: %d\n", ret); - - return ret; - } + if (ret) + return dev_err_probe(&pdev->dev, ret, "device reset failed\n"); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); host->regs = devm_ioremap_resource(&pdev->dev, res); @@ -1270,6 +1264,7 @@ static struct platform_driver meson_mmc_driver = { .remove = meson_mmc_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(meson_mmc_of_match), }, }; diff --git a/drivers/mmc/host/meson-mx-sdhc-mmc.c b/drivers/mmc/host/meson-mx-sdhc-mmc.c index 53e3f6a4245a..7cd9c0ec2fcf 100644 --- a/drivers/mmc/host/meson-mx-sdhc-mmc.c +++ b/drivers/mmc/host/meson-mx-sdhc-mmc.c @@ -903,6 +903,7 @@ static struct platform_driver meson_mx_sdhc_driver = { .remove = meson_mx_sdhc_remove, .driver = { .name = "meson-mx-sdhc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(meson_mx_sdhc_of_match), }, }; diff --git a/drivers/mmc/host/meson-mx-sdio.c b/drivers/mmc/host/meson-mx-sdio.c index 703d5834f9a5..1c5299cd0cbe 100644 --- a/drivers/mmc/host/meson-mx-sdio.c +++ b/drivers/mmc/host/meson-mx-sdio.c @@ -755,6 +755,7 @@ static struct platform_driver meson_mx_mmc_driver = { .remove = meson_mx_mmc_remove, .driver = { .name = "meson-mx-sdio", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(meson_mx_mmc_of_match), }, }; diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 18a850f37ddc..02f4fd26e76a 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -882,9 +882,9 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd, else clock_rate = spi->max_speed_hz; - timeout = data->timeout_ns + + timeout = data->timeout_ns / 1000 + data->timeout_clks * 1000000 / clock_rate; - timeout = usecs_to_jiffies((unsigned int)(timeout / 1000)) + 1; + timeout = usecs_to_jiffies((unsigned int)timeout) + 1; /* Handle scatterlist segments one at a time, with synch for * each 512-byte block diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index fc6b9cf27d0b..f25079ba3bca 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -689,19 +689,18 @@ static int moxart_remove(struct platform_device *pdev) dev_set_drvdata(&pdev->dev, NULL); - if (mmc) { - if (!IS_ERR(host->dma_chan_tx)) - dma_release_channel(host->dma_chan_tx); - if (!IS_ERR(host->dma_chan_rx)) - dma_release_channel(host->dma_chan_rx); - mmc_remove_host(mmc); - mmc_free_host(mmc); + if (!IS_ERR(host->dma_chan_tx)) + dma_release_channel(host->dma_chan_tx); + if (!IS_ERR(host->dma_chan_rx)) + dma_release_channel(host->dma_chan_rx); + mmc_remove_host(mmc); + mmc_free_host(mmc); + + writel(0, host->base + REG_INTERRUPT_MASK); + writel(0, host->base + REG_POWER_CONTROL); + writel(readl(host->base + REG_CLOCK_CONTROL) | CLK_OFF, + host->base + REG_CLOCK_CONTROL); - writel(0, host->base + REG_INTERRUPT_MASK); - writel(0, host->base + REG_POWER_CONTROL); - writel(readl(host->base + REG_CLOCK_CONTROL) | CLK_OFF, - host->base + REG_CLOCK_CONTROL); - } return 0; } @@ -717,6 +716,7 @@ static struct platform_driver moxart_mmc_driver = { .remove = moxart_remove, .driver = { .name = "mmc-moxart", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = moxart_mmc_match, }, }; diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index b0c27944db7f..a704745e5882 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -397,7 +397,6 @@ struct msdc_delay_phase { struct msdc_host { struct device *dev; const struct mtk_mmc_compatible *dev_comp; - struct mmc_host *mmc; /* mmc structure */ int cmd_rsp; spinlock_t lock; @@ -734,14 +733,15 @@ static void msdc_unprepare_data(struct msdc_host *host, struct mmc_request *mrq) static u64 msdc_timeout_cal(struct msdc_host *host, u64 ns, u64 clks) { + struct mmc_host *mmc = mmc_from_priv(host); u64 timeout, clk_ns; u32 mode = 0; - if (host->mmc->actual_clock == 0) { + if (mmc->actual_clock == 0) { timeout = 0; } else { clk_ns = 1000000000ULL; - do_div(clk_ns, host->mmc->actual_clock); + do_div(clk_ns, mmc->actual_clock); timeout = ns + clk_ns - 1; do_div(timeout, clk_ns); timeout += clks; @@ -802,6 +802,7 @@ static void msdc_ungate_clock(struct msdc_host *host) static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) { + struct mmc_host *mmc = mmc_from_priv(host); u32 mode; u32 flags; u32 div; @@ -811,7 +812,7 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) if (!hz) { dev_dbg(host->dev, "set mclk to 0\n"); host->mclk = 0; - host->mmc->actual_clock = 0; + mmc->actual_clock = 0; sdr_clr_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN); return; } @@ -890,7 +891,7 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) while (!(readl(host->base + MSDC_CFG) & MSDC_CFG_CKSTB)) cpu_relax(); sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN); - host->mmc->actual_clock = sclk; + mmc->actual_clock = sclk; host->mclk = hz; host->timing = timing; /* need because clk changed. */ @@ -901,7 +902,7 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) * mmc_select_hs400() will drop to 50Mhz and High speed mode, * tune result of hs200/200Mhz is not suitable for 50Mhz */ - if (host->mmc->actual_clock <= 52000000) { + if (mmc->actual_clock <= 52000000) { writel(host->def_tune_para.iocon, host->base + MSDC_IOCON); if (host->top_base) { writel(host->def_tune_para.emmc_top_control, @@ -932,7 +933,7 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRRDLY, host->hs400_cmd_int_delay); - dev_dbg(host->dev, "sclk: %d, timing: %d\n", host->mmc->actual_clock, + dev_dbg(host->dev, "sclk: %d, timing: %d\n", mmc->actual_clock, timing); } @@ -967,6 +968,7 @@ static inline u32 msdc_cmd_find_resp(struct msdc_host *host, static inline u32 msdc_cmd_prepare_raw_cmd(struct msdc_host *host, struct mmc_request *mrq, struct mmc_command *cmd) { + struct mmc_host *mmc = mmc_from_priv(host); /* rawcmd : * vol_swt << 30 | auto_cmd << 28 | blklen << 16 | go_irq << 15 | * stop << 14 | rw << 13 | dtype << 11 | rsptyp << 7 | brk << 6 | opcode @@ -993,7 +995,7 @@ static inline u32 msdc_cmd_prepare_raw_cmd(struct msdc_host *host, struct mmc_data *data = cmd->data; if (mmc_op_multi(opcode)) { - if (mmc_card_mmc(host->mmc->card) && mrq->sbc && + if (mmc_card_mmc(mmc->card) && mrq->sbc && !(mrq->sbc->arg & 0xFFFF0000)) rawcmd |= 0x2 << 28; /* AutoCMD23 */ } @@ -1070,9 +1072,10 @@ static int msdc_auto_cmd_done(struct msdc_host *host, int events, */ static void msdc_recheck_sdio_irq(struct msdc_host *host) { + struct mmc_host *mmc = mmc_from_priv(host); u32 reg_int, reg_inten, reg_ps; - if (host->mmc->caps & MMC_CAP_SDIO_IRQ) { + if (mmc->caps & MMC_CAP_SDIO_IRQ) { reg_inten = readl(host->base + MSDC_INTEN); if (reg_inten & MSDC_INTEN_SDIOIRQ) { reg_int = readl(host->base + MSDC_INT); @@ -1080,7 +1083,7 @@ static void msdc_recheck_sdio_irq(struct msdc_host *host) if (!(reg_int & MSDC_INT_SDIOIRQ || reg_ps & MSDC_PS_DATA1)) { __msdc_enable_sdio_irq(host, 0); - sdio_signal_irq(host->mmc); + sdio_signal_irq(mmc); } } } @@ -1113,7 +1116,7 @@ static void msdc_request_done(struct msdc_host *host, struct mmc_request *mrq) msdc_unprepare_data(host, mrq); if (host->error) msdc_reset_hw(host); - mmc_request_done(host->mmc, mrq); + mmc_request_done(mmc_from_priv(host), mrq); if (host->dev_comp->recheck_sdio_irq) msdc_recheck_sdio_irq(host); } @@ -1500,6 +1503,7 @@ static void msdc_enable_sdio_irq(struct mmc_host *mmc, int enb) static irqreturn_t msdc_cmdq_irq(struct msdc_host *host, u32 intsts) { + struct mmc_host *mmc = mmc_from_priv(host); int cmd_err = 0, dat_err = 0; if (intsts & MSDC_INT_RSPCRCERR) { @@ -1523,12 +1527,13 @@ static irqreturn_t msdc_cmdq_irq(struct msdc_host *host, u32 intsts) cmd_err, dat_err, intsts); } - return cqhci_irq(host->mmc, 0, cmd_err, dat_err); + return cqhci_irq(mmc, 0, cmd_err, dat_err); } static irqreturn_t msdc_irq(int irq, void *dev_id) { struct msdc_host *host = (struct msdc_host *) dev_id; + struct mmc_host *mmc = mmc_from_priv(host); while (true) { unsigned long flags; @@ -1551,18 +1556,18 @@ static irqreturn_t msdc_irq(int irq, void *dev_id) spin_unlock_irqrestore(&host->lock, flags); if ((events & event_mask) & MSDC_INT_SDIOIRQ) - sdio_signal_irq(host->mmc); + sdio_signal_irq(mmc); if ((events & event_mask) & MSDC_INT_CDSC) { if (host->internal_cd) - mmc_detect_change(host->mmc, msecs_to_jiffies(20)); + mmc_detect_change(mmc, msecs_to_jiffies(20)); events &= ~MSDC_INT_CDSC; } if (!(events & (event_mask & ~MSDC_INT_SDIOIRQ))) break; - if ((host->mmc->caps2 & MMC_CAP2_CQE) && + if ((mmc->caps2 & MMC_CAP2_CQE) && (events & MSDC_INT_CMDQ)) { msdc_cmdq_irq(host, events); /* clear interrupts */ @@ -2290,6 +2295,26 @@ static void msdc_cqe_disable(struct mmc_host *mmc, bool recovery) } } +static void msdc_cqe_pre_enable(struct mmc_host *mmc) +{ + struct cqhci_host *cq_host = mmc->cqe_private; + u32 reg; + + reg = cqhci_readl(cq_host, CQHCI_CFG); + reg |= CQHCI_ENABLE; + cqhci_writel(cq_host, reg, CQHCI_CFG); +} + +static void msdc_cqe_post_disable(struct mmc_host *mmc) +{ + struct cqhci_host *cq_host = mmc->cqe_private; + u32 reg; + + reg = cqhci_readl(cq_host, CQHCI_CFG); + reg &= ~CQHCI_ENABLE; + cqhci_writel(cq_host, reg, CQHCI_CFG); +} + static const struct mmc_host_ops mt_msdc_ops = { .post_req = msdc_post_req, .pre_req = msdc_pre_req, @@ -2309,6 +2334,8 @@ static const struct mmc_host_ops mt_msdc_ops = { static const struct cqhci_host_ops msdc_cmdq_ops = { .enable = msdc_cqe_enable, .disable = msdc_cqe_disable, + .pre_enable = msdc_cqe_pre_enable, + .post_disable = msdc_cqe_post_disable, }; static void msdc_of_property_parse(struct platform_device *pdev, @@ -2434,7 +2461,6 @@ static int msdc_drv_probe(struct platform_device *pdev) host->dev = &pdev->dev; host->dev_comp = of_device_get_match_data(&pdev->dev); - host->mmc = mmc; host->src_clk_freq = clk_get_rate(host->src_clk); /* Set host parameters to mmc */ mmc->ops = &mt_msdc_ops; @@ -2475,7 +2501,7 @@ static int msdc_drv_probe(struct platform_device *pdev) mmc_dev(mmc)->dma_mask = &host->dma_mask; if (mmc->caps2 & MMC_CAP2_CQE) { - host->cq_host = devm_kzalloc(host->mmc->parent, + host->cq_host = devm_kzalloc(mmc->parent, sizeof(*host->cq_host), GFP_KERNEL); if (!host->cq_host) { @@ -2560,7 +2586,7 @@ static int msdc_drv_remove(struct platform_device *pdev) pm_runtime_get_sync(host->dev); platform_set_drvdata(pdev, NULL); - mmc_remove_host(host->mmc); + mmc_remove_host(mmc); msdc_deinit_hw(host); msdc_gate_clock(host); @@ -2572,7 +2598,7 @@ static int msdc_drv_remove(struct platform_device *pdev) dma_free_coherent(&pdev->dev, MAX_BD_NUM * sizeof(struct mt_bdma_desc), host->dma.bd, host->dma.bd_addr); - mmc_free_host(host->mmc); + mmc_free_host(mmc); return 0; } @@ -2607,6 +2633,7 @@ static void msdc_save_reg(struct msdc_host *host) static void msdc_restore_reg(struct msdc_host *host) { + struct mmc_host *mmc = mmc_from_priv(host); u32 tune_reg = host->dev_comp->pad_tune_reg; writel(host->save_para.msdc_cfg, host->base + MSDC_CFG); @@ -2631,7 +2658,7 @@ static void msdc_restore_reg(struct msdc_host *host) writel(host->save_para.pad_tune, host->base + tune_reg); } - if (sdio_irq_claimed(host->mmc)) + if (sdio_irq_claimed(mmc)) __msdc_enable_sdio_irq(host, 1); } @@ -2667,6 +2694,7 @@ static struct platform_driver mt_msdc_driver = { .remove = msdc_drv_remove, .driver = { .name = "mtk-msdc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = msdc_of_ids, .pm = &msdc_dev_pm_ops, }, diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index cc0752a9df6d..629efbe639c4 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -824,6 +824,7 @@ static struct platform_driver mvsd_driver = { .remove = mvsd_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = mvsdio_dt_ids, }, }; diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index b3d654c688e5..12ee07285980 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -1244,6 +1244,7 @@ static struct platform_driver mxcmci_driver = { .id_table = mxcmci_devtype, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &mxcmci_pm_ops, .of_match_table = mxcmci_of_match, } diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index b1820def36c0..75007f61df97 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -726,6 +726,7 @@ static struct platform_driver mxs_mmc_driver = { .id_table = mxs_ssp_ids, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &mxs_mmc_pm_ops, .of_match_table = mxs_mmc_dt_ids, }, diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 33d7af7c7762..6aa0537f1f84 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1504,6 +1504,7 @@ static struct platform_driver mmc_omap_driver = { .remove = mmc_omap_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(mmc_omap_match), }, }; diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 37b8740513f5..aa9cc49206d1 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1114,8 +1114,7 @@ static int omap_hsmmc_switch_opcond(struct omap_hsmmc_host *host, int vdd) int ret; /* Disable the clocks */ - if (host->dbclk) - clk_disable_unprepare(host->dbclk); + clk_disable_unprepare(host->dbclk); /* Turn the power off */ ret = omap_hsmmc_set_power(host, 0); @@ -1123,8 +1122,7 @@ static int omap_hsmmc_switch_opcond(struct omap_hsmmc_host *host, int vdd) /* Turn the power ON with given VDD 1.8 or 3.0v */ if (!ret) ret = omap_hsmmc_set_power(host, 1); - if (host->dbclk) - clk_prepare_enable(host->dbclk); + clk_prepare_enable(host->dbclk); if (ret != 0) goto err; @@ -2014,8 +2012,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) pm_runtime_dont_use_autosuspend(host->dev); pm_runtime_put_sync(host->dev); pm_runtime_disable(host->dev); - if (host->dbclk) - clk_disable_unprepare(host->dbclk); + clk_disable_unprepare(host->dbclk); err1: mmc_free_host(mmc); err: @@ -2037,8 +2034,7 @@ static int omap_hsmmc_remove(struct platform_device *pdev) pm_runtime_put_sync(host->dev); pm_runtime_disable(host->dev); device_init_wakeup(&pdev->dev, false); - if (host->dbclk) - clk_disable_unprepare(host->dbclk); + clk_disable_unprepare(host->dbclk); mmc_free_host(host->mmc); @@ -2063,8 +2059,7 @@ static int omap_hsmmc_suspend(struct device *dev) OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP); } - if (host->dbclk) - clk_disable_unprepare(host->dbclk); + clk_disable_unprepare(host->dbclk); pm_runtime_put_sync(host->dev); return 0; @@ -2080,8 +2075,7 @@ static int omap_hsmmc_resume(struct device *dev) pm_runtime_get_sync(host->dev); - if (host->dbclk) - clk_prepare_enable(host->dbclk); + clk_prepare_enable(host->dbclk); if (!(host->mmc->pm_flags & MMC_PM_KEEP_POWER)) omap_hsmmc_conf_bus_power(host); @@ -2171,6 +2165,7 @@ static struct platform_driver omap_hsmmc_driver = { .remove = omap_hsmmc_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &omap_hsmmc_dev_pm_ops, .of_match_table = of_match_ptr(omap_mmc_of_match), }, diff --git a/drivers/mmc/host/owl-mmc.c b/drivers/mmc/host/owl-mmc.c index df43f42855e2..ccf214a89eda 100644 --- a/drivers/mmc/host/owl-mmc.c +++ b/drivers/mmc/host/owl-mmc.c @@ -689,6 +689,7 @@ MODULE_DEVICE_TABLE(of, owl_mmc_of_match); static struct platform_driver owl_mmc_driver = { .driver = { .name = "owl_mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = owl_mmc_of_match, }, .probe = owl_mmc_probe, diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index 3a9333475a2b..29f6180a0036 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -811,6 +811,7 @@ static struct platform_driver pxamci_driver = { .remove = pxamci_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(pxa_mmc_dt_ids), }, }; diff --git a/drivers/mmc/host/renesas_sdhi.h b/drivers/mmc/host/renesas_sdhi.h index 14c64caefc64..cb962c7883dc 100644 --- a/drivers/mmc/host/renesas_sdhi.h +++ b/drivers/mmc/host/renesas_sdhi.h @@ -33,10 +33,13 @@ struct renesas_sdhi_of_data { unsigned short max_segs; }; +#define SDHI_CALIB_TABLE_MAX 32 + struct renesas_sdhi_quirks { bool hs400_disabled; bool hs400_4taps; u32 hs400_bad_taps; + const u8 (*hs400_calib_table)[SDHI_CALIB_TABLE_MAX]; }; struct tmio_mmc_dma { @@ -58,7 +61,8 @@ struct renesas_sdhi { void __iomem *scc_ctl; u32 scc_tappos; u32 scc_tappos_hs400; - bool doing_tune; + const u8 *adjust_hs400_calib_table; + bool needs_adjust_hs400; /* Tuning values: 1 for success, 0 for failure */ DECLARE_BITMAP(taps, BITS_PER_LONG); diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 904f5237d8f7..414314151d0a 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,8 @@ #define SDHI_VER_GEN3_SD 0xcc10 #define SDHI_VER_GEN3_SDMMC 0xcd10 +#define SDHI_GEN3_MMC0_ADDR 0xee140000 + static void renesas_sdhi_sdbuf_width(struct tmio_mmc_host *host, int width) { u32 val; @@ -117,8 +120,12 @@ static unsigned int renesas_sdhi_clk_update(struct tmio_mmc_host *host, unsigned int freq, diff, best_freq = 0, diff_min = ~0; int i; - /* tested only on R-Car Gen2+ currently; may work for others */ - if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2)) + /* + * We simply return the current rate if a) we are not on a R-Car Gen2+ + * SoC (may work for others, but untested) or b) if the SCC needs its + * clock during tuning, so we don't change the external clock setup. + */ + if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2) || mmc_doing_tune(host->mmc)) return clk_get_rate(priv->clk); /* @@ -247,6 +254,11 @@ static int renesas_sdhi_start_signal_voltage_switch(struct mmc_host *mmc, #define SH_MOBILE_SDHI_SCC_RVSREQ 0x00A #define SH_MOBILE_SDHI_SCC_SMPCMP 0x00C #define SH_MOBILE_SDHI_SCC_TMPPORT2 0x00E +#define SH_MOBILE_SDHI_SCC_TMPPORT3 0x014 +#define SH_MOBILE_SDHI_SCC_TMPPORT4 0x016 +#define SH_MOBILE_SDHI_SCC_TMPPORT5 0x018 +#define SH_MOBILE_SDHI_SCC_TMPPORT6 0x01A +#define SH_MOBILE_SDHI_SCC_TMPPORT7 0x01C #define SH_MOBILE_SDHI_SCC_DTCNTL_TAPEN BIT(0) #define SH_MOBILE_SDHI_SCC_DTCNTL_TAPNUM_SHIFT 16 @@ -267,6 +279,40 @@ static int renesas_sdhi_start_signal_voltage_switch(struct mmc_host *mmc, #define SH_MOBILE_SDHI_SCC_TMPPORT2_HS400OSEL BIT(4) #define SH_MOBILE_SDHI_SCC_TMPPORT2_HS400EN BIT(31) +/* Definitions for values the SH_MOBILE_SDHI_SCC_TMPPORT4 register */ +#define SH_MOBILE_SDHI_SCC_TMPPORT4_DLL_ACC_START BIT(0) + +/* Definitions for values the SH_MOBILE_SDHI_SCC_TMPPORT5 register */ +#define SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_RW_SEL_R BIT(8) +#define SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_RW_SEL_W (0 << 8) +#define SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_ADR_MASK 0x3F + +/* Definitions for values the SH_MOBILE_SDHI_SCC register */ +#define SH_MOBILE_SDHI_SCC_TMPPORT_DISABLE_WP_CODE 0xa5000000 +#define SH_MOBILE_SDHI_SCC_TMPPORT_CALIB_CODE_MASK 0x1f +#define SH_MOBILE_SDHI_SCC_TMPPORT_MANUAL_MODE BIT(7) + +static const u8 r8a7796_es13_calib_table[2][SDHI_CALIB_TABLE_MAX] = { + { 3, 3, 3, 3, 3, 3, 3, 4, 4, 5, 6, 7, 8, 9, 10, 15, + 16, 16, 16, 16, 16, 16, 17, 18, 18, 19, 20, 21, 22, 23, 24, 25 }, + { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 8, 11, + 12, 17, 18, 18, 18, 18, 18, 18, 18, 19, 20, 21, 22, 23, 25, 25 } +}; + +static const u8 r8a77965_calib_table[2][SDHI_CALIB_TABLE_MAX] = { + { 1, 2, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31 }, + { 2, 3, 4, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 17, 17, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 31, 31, 31 } +}; + +static const u8 r8a77990_calib_table[2][SDHI_CALIB_TABLE_MAX] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 8, 9, 10, + 11, 12, 13, 15, 16, 17, 17, 18, 18, 19, 20, 22, 24, 25, 26, 26 } +}; + static inline u32 sd_scc_read32(struct tmio_mmc_host *host, struct renesas_sdhi *priv, int addr) { @@ -373,6 +419,9 @@ static void renesas_sdhi_hs400_complete(struct mmc_host *mmc) sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); + + if (priv->adjust_hs400_calib_table) + priv->needs_adjust_hs400 = true; } static void renesas_sdhi_reset_scc(struct tmio_mmc_host *host, @@ -403,6 +452,74 @@ static void renesas_sdhi_disable_scc(struct mmc_host *mmc) sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); } +static u32 sd_scc_tmpport_read32(struct tmio_mmc_host *host, + struct renesas_sdhi *priv, u32 addr) +{ + /* read mode */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT5, + SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_RW_SEL_R | + (SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_ADR_MASK & addr)); + + /* access start and stop */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT4, + SH_MOBILE_SDHI_SCC_TMPPORT4_DLL_ACC_START); + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT4, 0); + + return sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT7); +} + +static void sd_scc_tmpport_write32(struct tmio_mmc_host *host, + struct renesas_sdhi *priv, u32 addr, u32 val) +{ + /* write mode */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT5, + SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_RW_SEL_W | + (SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_ADR_MASK & addr)); + + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT6, val); + + /* access start and stop */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT4, + SH_MOBILE_SDHI_SCC_TMPPORT4_DLL_ACC_START); + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT4, 0); +} + +static void renesas_sdhi_adjust_hs400_mode_enable(struct tmio_mmc_host *host) +{ + struct renesas_sdhi *priv = host_to_priv(host); + u32 calib_code; + + /* disable write protect */ + sd_scc_tmpport_write32(host, priv, 0x00, + SH_MOBILE_SDHI_SCC_TMPPORT_DISABLE_WP_CODE); + /* read calibration code and adjust */ + calib_code = sd_scc_tmpport_read32(host, priv, 0x26); + calib_code &= SH_MOBILE_SDHI_SCC_TMPPORT_CALIB_CODE_MASK; + + sd_scc_tmpport_write32(host, priv, 0x22, + SH_MOBILE_SDHI_SCC_TMPPORT_MANUAL_MODE | + priv->adjust_hs400_calib_table[calib_code]); + + /* set offset value to TMPPORT3, hardcoded to OFFSET0 (= 0x3) for now */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT3, 0x3); + + /* adjustment done, clear flag */ + priv->needs_adjust_hs400 = false; +} + +static void renesas_sdhi_adjust_hs400_mode_disable(struct tmio_mmc_host *host) +{ + struct renesas_sdhi *priv = host_to_priv(host); + + /* disable write protect */ + sd_scc_tmpport_write32(host, priv, 0x00, + SH_MOBILE_SDHI_SCC_TMPPORT_DISABLE_WP_CODE); + /* disable manual calibration */ + sd_scc_tmpport_write32(host, priv, 0x22, 0); + /* clear offset value of TMPPORT3 */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT3, 0); +} + static void renesas_sdhi_reset_hs400_mode(struct tmio_mmc_host *host, struct renesas_sdhi *priv) { @@ -420,6 +537,9 @@ static void renesas_sdhi_reset_hs400_mode(struct tmio_mmc_host *host, SH_MOBILE_SDHI_SCC_TMPPORT2_HS400OSEL) & sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT2)); + if (priv->adjust_hs400_calib_table) + renesas_sdhi_adjust_hs400_mode_disable(host); + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); } @@ -432,6 +552,37 @@ static int renesas_sdhi_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_io return 0; } +static void renesas_sdhi_reset(struct tmio_mmc_host *host) +{ + struct renesas_sdhi *priv = host_to_priv(host); + + renesas_sdhi_reset_scc(host, priv); + renesas_sdhi_reset_hs400_mode(host, priv); + priv->needs_adjust_hs400 = false; + + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | + sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); + + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL, + ~SH_MOBILE_SDHI_SCC_RVSCNTL_RVSEN & + sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL)); + + if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) + sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, + TMIO_MASK_INIT_RCAR2); +} + +/* + * This is a temporary workaround! This driver used 'hw_reset' wrongly and the + * fix for that showed a regression. So, we mimic the old behaviour until the + * proper solution is found. + */ +static void renesas_sdhi_hw_reset(struct mmc_host *mmc) +{ + struct tmio_mmc_host *host = mmc_priv(mmc); + renesas_sdhi_reset(host); +} + #define SH_MOBILE_SDHI_MIN_TAP_ROW 3 static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) @@ -441,7 +592,6 @@ static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) unsigned int taps_size = priv->tap_num * 2, min_tap_row; unsigned long *bitmap; - priv->doing_tune = false; sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_RVSREQ, 0); /* @@ -500,10 +650,11 @@ static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) return 0; } -static int renesas_sdhi_execute_tuning(struct tmio_mmc_host *host, u32 opcode) +static int renesas_sdhi_execute_tuning(struct mmc_host *mmc, u32 opcode) { + struct tmio_mmc_host *host = mmc_priv(mmc); struct renesas_sdhi *priv = host_to_priv(host); - int i; + int i, ret; priv->tap_num = renesas_sdhi_init_tuning(host); if (!priv->tap_num) @@ -515,7 +666,6 @@ static int renesas_sdhi_execute_tuning(struct tmio_mmc_host *host, u32 opcode) return -EINVAL; } - priv->doing_tune = true; bitmap_zero(priv->taps, priv->tap_num * 2); bitmap_zero(priv->smpcmp, priv->tap_num * 2); @@ -524,14 +674,17 @@ static int renesas_sdhi_execute_tuning(struct tmio_mmc_host *host, u32 opcode) /* Set sampling clock position */ sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TAPSET, i % priv->tap_num); - if (mmc_send_tuning(host->mmc, opcode, NULL) == 0) + if (mmc_send_tuning(mmc, opcode, NULL) == 0) set_bit(i, priv->taps); if (sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_SMPCMP) == 0) set_bit(i, priv->smpcmp); } - return renesas_sdhi_select_tuning(host); + ret = renesas_sdhi_select_tuning(host); + if (ret < 0) + renesas_sdhi_reset(host); + return ret; } static bool renesas_sdhi_manual_correction(struct tmio_mmc_host *host, bool use_4tap) @@ -621,7 +774,7 @@ static bool renesas_sdhi_check_scc_error(struct tmio_mmc_host *host) !(host->mmc->ios.timing == MMC_TIMING_MMC_HS400 && !use_4tap)) return false; - if (mmc_doing_retune(host->mmc) || priv->doing_tune) + if (mmc_doing_tune(host->mmc)) return false; if (sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL) & @@ -631,27 +784,6 @@ static bool renesas_sdhi_check_scc_error(struct tmio_mmc_host *host) return renesas_sdhi_manual_correction(host, use_4tap); } -static void renesas_sdhi_hw_reset(struct tmio_mmc_host *host) -{ - struct renesas_sdhi *priv; - - priv = host_to_priv(host); - - renesas_sdhi_reset_scc(host, priv); - renesas_sdhi_reset_hs400_mode(host, priv); - - sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | - sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); - - sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL, - ~SH_MOBILE_SDHI_SCC_RVSCNTL_RVSEN & - sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL)); - - if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) - sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, - TMIO_MASK_INIT_RCAR2); -} - static int renesas_sdhi_wait_idle(struct tmio_mmc_host *host, u32 bit) { int timeout = 1000; @@ -711,6 +843,13 @@ static int renesas_sdhi_multi_io_quirk(struct mmc_card *card, return blk_size; } +static void renesas_sdhi_fixup_request(struct tmio_mmc_host *host, struct mmc_request *mrq) +{ + struct renesas_sdhi *priv = host_to_priv(host); + + if (priv->needs_adjust_hs400 && mrq->cmd->opcode == MMC_SEND_STATUS) + renesas_sdhi_adjust_hs400_mode_enable(host); +} static void renesas_sdhi_enable_dma(struct tmio_mmc_host *host, bool enable) { /* Iff regs are 8 byte apart, sdbuf is 64 bit. Otherwise always 32. */ @@ -742,6 +881,21 @@ static const struct renesas_sdhi_quirks sdhi_quirks_bad_taps2367 = { .hs400_bad_taps = BIT(2) | BIT(3) | BIT(6) | BIT(7), }; +static const struct renesas_sdhi_quirks sdhi_quirks_r8a7796_es13 = { + .hs400_4taps = true, + .hs400_bad_taps = BIT(2) | BIT(3) | BIT(6) | BIT(7), + .hs400_calib_table = r8a7796_es13_calib_table, +}; + +static const struct renesas_sdhi_quirks sdhi_quirks_r8a77965 = { + .hs400_bad_taps = BIT(2) | BIT(3) | BIT(6) | BIT(7), + .hs400_calib_table = r8a77965_calib_table, +}; + +static const struct renesas_sdhi_quirks sdhi_quirks_r8a77990 = { + .hs400_calib_table = r8a77990_calib_table, +}; + /* * Note for r8a7796 / r8a774a1: we can't distinguish ES1.1 and 1.2 as of now. * So, we want to treat them equally and only have a match for ES1.2 to enforce @@ -753,10 +907,11 @@ static const struct soc_device_attribute sdhi_quirks_match[] = { { .soc_id = "r8a7795", .revision = "ES2.0", .data = &sdhi_quirks_4tap }, { .soc_id = "r8a7795", .revision = "ES3.*", .data = &sdhi_quirks_bad_taps2367 }, { .soc_id = "r8a7796", .revision = "ES1.[012]", .data = &sdhi_quirks_4tap_nohs400 }, - { .soc_id = "r8a7796", .revision = "ES1.*", .data = &sdhi_quirks_4tap }, + { .soc_id = "r8a7796", .revision = "ES1.*", .data = &sdhi_quirks_r8a7796_es13 }, { .soc_id = "r8a7796", .revision = "ES3.*", .data = &sdhi_quirks_bad_taps1357 }, - { .soc_id = "r8a77965", .data = &sdhi_quirks_bad_taps2367 }, + { .soc_id = "r8a77965", .data = &sdhi_quirks_r8a77965 }, { .soc_id = "r8a77980", .data = &sdhi_quirks_nohs400 }, + { .soc_id = "r8a77990", .data = &sdhi_quirks_r8a77990 }, { /* Sentinel. */ }, }; @@ -862,11 +1017,11 @@ int renesas_sdhi_probe(struct platform_device *pdev, renesas_sdhi_start_signal_voltage_switch; host->sdcard_irq_setbit_mask = TMIO_STAT_ALWAYS_SET_27; - /* SDR and HS200/400 registers requires HW reset */ if (of_data && of_data->scc_offset) { priv->scc_ctl = host->ctl + of_data->scc_offset; + host->reset = renesas_sdhi_reset; + host->ops.hw_reset = renesas_sdhi_hw_reset; host->mmc->caps |= MMC_CAP_HW_RESET; - host->hw_reset = renesas_sdhi_hw_reset; } } @@ -915,6 +1070,14 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (ver == SDHI_VER_GEN2_SDR50) mmc_data->flags &= ~TMIO_MMC_HAVE_CBSY; + if (ver == SDHI_VER_GEN3_SDMMC && quirks && quirks->hs400_calib_table) { + host->fixup_request = renesas_sdhi_fixup_request; + priv->adjust_hs400_calib_table = *( + res->start == SDHI_GEN3_MMC0_ADDR ? + quirks->hs400_calib_table : + quirks->hs400_calib_table + 1); + } + ret = tmio_mmc_host_probe(host); if (ret < 0) goto edisclk; @@ -943,8 +1106,8 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (!hit) dev_warn(&host->pdev->dev, "Unknown clock rate for tuning\n"); - host->execute_tuning = renesas_sdhi_execute_tuning; host->check_retune = renesas_sdhi_check_scc_error; + host->ops.execute_tuning = renesas_sdhi_execute_tuning; host->ops.prepare_hs400_tuning = renesas_sdhi_prepare_hs400_tuning; host->ops.hs400_downgrade = renesas_sdhi_disable_scc; host->ops.hs400_complete = renesas_sdhi_hs400_complete; diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c index 32ab991544ef..fe13e1ea22dc 100644 --- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c @@ -336,10 +336,6 @@ static int renesas_sdhi_internal_dmac_probe(struct platform_device *pdev) if (soc) global_flags |= (unsigned long)soc->data; - dev->dma_parms = devm_kzalloc(dev, sizeof(*dev->dma_parms), GFP_KERNEL); - if (!dev->dma_parms) - return -ENOMEM; - /* value is max of SD_SECCNT. Confirmed by HW engineers */ dma_set_max_seg_size(dev, 0xffffffff); @@ -357,6 +353,7 @@ static const struct dev_pm_ops renesas_sdhi_internal_dmac_dev_pm_ops = { static struct platform_driver renesas_internal_dmac_sdhi_driver = { .driver = { .name = "renesas_sdhi_internal_dmac", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &renesas_sdhi_internal_dmac_dev_pm_ops, .of_match_table = renesas_sdhi_internal_dmac_of_match, }, diff --git a/drivers/mmc/host/renesas_sdhi_sys_dmac.c b/drivers/mmc/host/renesas_sdhi_sys_dmac.c index 13ff023fbee9..c5f789675302 100644 --- a/drivers/mmc/host/renesas_sdhi_sys_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_sys_dmac.c @@ -463,6 +463,7 @@ static const struct dev_pm_ops renesas_sdhi_sys_dmac_dev_pm_ops = { static struct platform_driver renesas_sys_dmac_sdhi_driver = { .driver = { .name = "sh_mobile_sdhi", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &renesas_sdhi_sys_dmac_dev_pm_ops, .of_match_table = renesas_sdhi_sys_dmac_of_match, }, diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c index 2763a376b054..eb395e144207 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -1471,6 +1471,7 @@ static struct platform_driver rtsx_pci_sdmmc_driver = { .id_table = rtsx_pci_sdmmc_ids, .driver = { .name = DRV_NAME_RTSX_PCI_SDMMC, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; module_platform_driver(rtsx_pci_sdmmc_driver); diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index 7225d9312af8..5fe4528e296e 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -579,7 +579,6 @@ static void sd_normal_rw(struct rtsx_usb_sdmmc *host, static int sd_change_phase(struct rtsx_usb_sdmmc *host, u8 sample_point, int tx) { struct rtsx_ucr *ucr = host->ucr; - int err; dev_dbg(sdmmc_dev(host), "%s: %s sample_point = %d\n", __func__, tx ? "TX" : "RX", sample_point); @@ -601,11 +600,7 @@ static int sd_change_phase(struct rtsx_usb_sdmmc *host, u8 sample_point, int tx) rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CLK_DIV, CLK_CHANGE, 0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SD_CFG1, SD_ASYNC_FIFO_RST, 0); - err = rtsx_usb_send_cmd(ucr, MODE_C, 100); - if (err) - return err; - - return 0; + return rtsx_usb_send_cmd(ucr, MODE_C, 100); } static inline u32 get_phase_point(u32 phase_map, unsigned int idx) @@ -1458,6 +1453,7 @@ static struct platform_driver rtsx_usb_sdmmc_driver = { .id_table = rtsx_usb_sdmmc_ids, .driver = { .name = "rtsx_usb_sdmmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &rtsx_usb_sdmmc_dev_pm_ops, }, }; diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index 444b2769ae2c..e3698aba8dd3 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -150,8 +150,8 @@ static void s3cmci_reset(struct s3cmci_host *host); static void dbg_dumpregs(struct s3cmci_host *host, char *prefix) { - u32 con, pre, cmdarg, cmdcon, cmdsta, r0, r1, r2, r3, timer, bsize; - u32 datcon, datcnt, datsta, fsta, imask; + u32 con, pre, cmdarg, cmdcon, cmdsta, r0, r1, r2, r3, timer; + u32 datcon, datcnt, datsta, fsta; con = readl(host->base + S3C2410_SDICON); pre = readl(host->base + S3C2410_SDIPRE); @@ -163,12 +163,10 @@ static void dbg_dumpregs(struct s3cmci_host *host, char *prefix) r2 = readl(host->base + S3C2410_SDIRSP2); r3 = readl(host->base + S3C2410_SDIRSP3); timer = readl(host->base + S3C2410_SDITIMER); - bsize = readl(host->base + S3C2410_SDIBSIZE); datcon = readl(host->base + S3C2410_SDIDCON); datcnt = readl(host->base + S3C2410_SDIDCNT); datsta = readl(host->base + S3C2410_SDIDSTA); fsta = readl(host->base + S3C2410_SDIFSTA); - imask = readl(host->base + host->sdiimsk); dbg(host, dbg_debug, "%s CON:[%08x] PRE:[%08x] TMR:[%08x]\n", prefix, con, pre, timer); @@ -396,9 +394,6 @@ static void s3cmci_enable_irq(struct s3cmci_host *host, bool more) local_irq_restore(flags); } -/** - * - */ static void s3cmci_disable_irq(struct s3cmci_host *host, bool transfer) { unsigned long flags; @@ -1379,7 +1374,7 @@ static int s3cmci_state_show(struct seq_file *seq, void *v) { struct s3cmci_host *host = seq->private; - seq_printf(seq, "Register base = 0x%08x\n", (u32)host->base); + seq_printf(seq, "Register base = 0x%p\n", host->base); seq_printf(seq, "Clock rate = %ld\n", host->clk_rate); seq_printf(seq, "Prescale = %d\n", host->prescaler); seq_printf(seq, "is2440 = %d\n", host->is2440); @@ -1522,7 +1517,7 @@ static int s3cmci_probe_dt(struct s3cmci_host *host) struct mmc_host *mmc = host->mmc; int ret; - host->is2440 = (int) of_device_get_match_data(&pdev->dev); + host->is2440 = (long) of_device_get_match_data(&pdev->dev); ret = mmc_of_parse(mmc); if (ret) @@ -1809,6 +1804,7 @@ MODULE_DEVICE_TABLE(platform, s3cmci_driver_ids); static struct platform_driver s3cmci_driver = { .driver = { .name = "s3c-sdi", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = s3cmci_dt_match, }, .id_table = s3cmci_driver_ids, diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 284cba11e279..54205e3be9e8 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -662,6 +662,43 @@ static int sdhci_acpi_emmc_amd_probe_slot(struct platform_device *pdev, (host->mmc->caps & MMC_CAP_1_8V_DDR)) host->mmc->caps2 = MMC_CAP2_HS400_1_8V; + /* + * There are two types of presets out in the wild: + * 1) Default/broken presets. + * These presets have two sets of problems: + * a) The clock divisor for SDR12, SDR25, and SDR50 is too small. + * This results in clock frequencies that are 2x higher than + * acceptable. i.e., SDR12 = 25 MHz, SDR25 = 50 MHz, SDR50 = + * 100 MHz.x + * b) The HS200 and HS400 driver strengths don't match. + * By default, the SDR104 preset register has a driver strength of + * A, but the (internal) HS400 preset register has a driver + * strength of B. As part of initializing HS400, HS200 tuning + * needs to be performed. Having different driver strengths + * between tuning and operation is wrong. It results in different + * rise/fall times that lead to incorrect sampling. + * 2) Firmware with properly initialized presets. + * These presets have proper clock divisors. i.e., SDR12 => 12MHz, + * SDR25 => 25 MHz, SDR50 => 50 MHz. Additionally the HS200 and + * HS400 preset driver strengths match. + * + * Enabling presets for HS400 doesn't work for the following reasons: + * 1) sdhci_set_ios has a hard coded list of timings that are used + * to determine if presets should be enabled. + * 2) sdhci_get_preset_value is using a non-standard register to + * read out HS400 presets. The AMD controller doesn't support this + * non-standard register. In fact, it doesn't expose the HS400 + * preset register anywhere in the SDHCI memory map. This results + * in reading a garbage value and using the wrong presets. + * + * Since HS400 and HS200 presets must be identical, we could + * instead use the the SDR104 preset register. + * + * If the above issues are resolved we could remove this quirk for + * firmware that that has valid presets (i.e., SDR12 <= 12 MHz). + */ + host->quirks2 |= SDHCI_QUIRK2_PRESET_VALUE_BROKEN; + host->mmc_host_ops.select_drive_strength = amd_select_drive_strength; host->mmc_host_ops.set_ios = amd_set_ios; host->mmc_host_ops.execute_tuning = amd_sdhci_execute_tuning; @@ -1027,6 +1064,7 @@ static const struct dev_pm_ops sdhci_acpi_pm_ops = { static struct platform_driver sdhci_acpi_driver = { .driver = { .name = "sdhci-acpi", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .acpi_match_table = sdhci_acpi_ids, .pm = &sdhci_acpi_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-bcm-kona.c b/drivers/mmc/host/sdhci-bcm-kona.c index a6c2bd202b45..4d4aac85cc7a 100644 --- a/drivers/mmc/host/sdhci-bcm-kona.c +++ b/drivers/mmc/host/sdhci-bcm-kona.c @@ -324,6 +324,7 @@ static int sdhci_bcm_kona_probe(struct platform_device *pdev) static struct platform_driver sdhci_bcm_kona_driver = { .driver = { .name = "sdhci-kona", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, .of_match_table = sdhci_bcm_kona_of_match, }, diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c index ad01f6451a95..bbf3496f4495 100644 --- a/drivers/mmc/host/sdhci-brcmstb.c +++ b/drivers/mmc/host/sdhci-brcmstb.c @@ -235,13 +235,11 @@ static int sdhci_brcmstb_probe(struct platform_device *pdev) dev_dbg(&pdev->dev, "Probe found match for %s\n", match->compatible); - clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(clk)) { - if (PTR_ERR(clk) == -EPROBE_DEFER) - return -EPROBE_DEFER; - dev_err(&pdev->dev, "Clock not found in Device Tree\n"); - clk = NULL; - } + clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(clk), + "Failed to get clock from Device Tree\n"); + res = clk_prepare_enable(clk); if (res) return res; @@ -328,6 +326,7 @@ MODULE_DEVICE_TABLE(of, sdhci_brcm_of_match); static struct platform_driver sdhci_brcmstb_driver = { .driver = { .name = "sdhci-brcmstb", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, .of_match_table = of_match_ptr(sdhci_brcm_of_match), }, diff --git a/drivers/mmc/host/sdhci-cadence.c b/drivers/mmc/host/sdhci-cadence.c index 4d9f7681817c..6f2de54a5987 100644 --- a/drivers/mmc/host/sdhci-cadence.c +++ b/drivers/mmc/host/sdhci-cadence.c @@ -463,6 +463,7 @@ MODULE_DEVICE_TABLE(of, sdhci_cdns_match); static struct platform_driver sdhci_cdns_driver = { .driver = { .name = "sdhci-cdns", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_cdns_pm_ops, .of_match_table = sdhci_cdns_match, }, diff --git a/drivers/mmc/host/sdhci-cns3xxx.c b/drivers/mmc/host/sdhci-cns3xxx.c index 811eab1b8964..2a29c7a4f308 100644 --- a/drivers/mmc/host/sdhci-cns3xxx.c +++ b/drivers/mmc/host/sdhci-cns3xxx.c @@ -98,6 +98,7 @@ static int sdhci_cns3xxx_probe(struct platform_device *pdev) static struct platform_driver sdhci_cns3xxx_driver = { .driver = { .name = "sdhci-cns3xxx", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, }, .probe = sdhci_cns3xxx_probe, diff --git a/drivers/mmc/host/sdhci-dove.c b/drivers/mmc/host/sdhci-dove.c index fe9da3122fe9..5e5bf82e5976 100644 --- a/drivers/mmc/host/sdhci-dove.c +++ b/drivers/mmc/host/sdhci-dove.c @@ -105,6 +105,7 @@ MODULE_DEVICE_TABLE(of, sdhci_dove_of_match_table); static struct platform_driver sdhci_dove_driver = { .driver = { .name = "sdhci-dove", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, .of_match_table = sdhci_dove_of_match_table, }, diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index d738907a622f..fce8fa7e6b30 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -987,10 +987,20 @@ static int usdhc_execute_tuning(struct mmc_host *mmc, u32 opcode) static void esdhc_prepare_tuning(struct sdhci_host *host, u32 val) { u32 reg; + u8 sw_rst; + int ret; /* FIXME: delay a bit for card to be ready for next tuning due to errors */ mdelay(1); + /* IC suggest to reset USDHC before every tuning command */ + esdhc_clrset_le(host, 0xff, SDHCI_RESET_ALL, SDHCI_SOFTWARE_RESET); + ret = readb_poll_timeout(host->ioaddr + SDHCI_SOFTWARE_RESET, sw_rst, + !(sw_rst & SDHCI_RESET_ALL), 10, 100); + if (ret == -ETIMEDOUT) + dev_warn(mmc_dev(host->mmc), + "warning! RESET_ALL never complete before sending tuning command\n"); + reg = readl(host->ioaddr + ESDHC_MIX_CTRL); reg |= ESDHC_MIX_CTRL_EXE_TUNE | ESDHC_MIX_CTRL_SMPCLK_SEL | ESDHC_MIX_CTRL_FBCLK_SEL; @@ -1367,7 +1377,7 @@ static void sdhci_esdhc_imx_hwinit(struct sdhci_host *host) * response, block the tuning procedure or the first command * after the whole tuning procedure always can't get any response. */ - tmp |= ESDHC_TUNING_CMD_CRC_CHECK_DISABLE; + tmp |= ESDHC_TUNING_CMD_CRC_CHECK_DISABLE; writel(tmp, host->ioaddr + ESDHC_TUNING_CTRL); } else if (imx_data->socdata->flags & ESDHC_FLAG_MAN_TUNING) { /* @@ -1643,10 +1653,8 @@ static int sdhci_esdhc_imx_probe(struct platform_device *pdev) goto disable_ipg_clk; imx_data->pinctrl = devm_pinctrl_get(&pdev->dev); - if (IS_ERR(imx_data->pinctrl)) { - err = PTR_ERR(imx_data->pinctrl); + if (IS_ERR(imx_data->pinctrl)) dev_warn(mmc_dev(host->mmc), "could not get pinctrl\n"); - } if (esdhc_is_usdhc(imx_data)) { host->quirks2 |= SDHCI_QUIRK2_PRESET_VALUE_BROKEN; @@ -1917,6 +1925,7 @@ static const struct dev_pm_ops sdhci_esdhc_pmops = { static struct platform_driver sdhci_esdhc_imx_driver = { .driver = { .name = "sdhci-esdhc-imx", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = imx_esdhc_dt_ids, .pm = &sdhci_esdhc_pmops, }, diff --git a/drivers/mmc/host/sdhci-esdhc-mcf.c b/drivers/mmc/host/sdhci-esdhc-mcf.c index 71bf086a9812..ca7a1690b2a8 100644 --- a/drivers/mmc/host/sdhci-esdhc-mcf.c +++ b/drivers/mmc/host/sdhci-esdhc-mcf.c @@ -509,6 +509,7 @@ static int sdhci_esdhc_mcf_remove(struct platform_device *pdev) static struct platform_driver sdhci_esdhc_mcf_driver = { .driver = { .name = "sdhci-esdhc-mcf", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, .probe = sdhci_esdhc_mcf_probe, .remove = sdhci_esdhc_mcf_remove, diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c index e2d8dfe90077..c9434b461aab 100644 --- a/drivers/mmc/host/sdhci-iproc.c +++ b/drivers/mmc/host/sdhci-iproc.c @@ -283,6 +283,7 @@ static const struct sdhci_pltfm_data sdhci_bcm2711_pltfm_data = { static const struct sdhci_iproc_data bcm2711_data = { .pdata = &sdhci_bcm2711_pltfm_data, + .mmc_caps = MMC_CAP_3_3V_DDR, }; static const struct of_device_id sdhci_iproc_of_match[] = { @@ -368,6 +369,7 @@ static int sdhci_iproc_probe(struct platform_device *pdev) static struct platform_driver sdhci_iproc_driver = { .driver = { .name = "sdhci-iproc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_iproc_of_match, .acpi_match_table = ACPI_PTR(sdhci_iproc_acpi_ids), .pm = &sdhci_pltfm_pmops, diff --git a/drivers/mmc/host/sdhci-milbeaut.c b/drivers/mmc/host/sdhci-milbeaut.c index 4e7cc0680f94..148b37ac6564 100644 --- a/drivers/mmc/host/sdhci-milbeaut.c +++ b/drivers/mmc/host/sdhci-milbeaut.c @@ -333,6 +333,7 @@ static int sdhci_milbeaut_remove(struct platform_device *pdev) static struct platform_driver sdhci_milbeaut_driver = { .driver = { .name = "sdhci-milbeaut", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(mlb_dt_ids), }, .probe = sdhci_milbeaut_probe, diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 729868abd2db..3451eb325513 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -263,7 +263,6 @@ struct sdhci_msm_host { unsigned long clk_rate; struct mmc_host *mmc; struct opp_table *opp_table; - bool has_opp_table; bool use_14lpp_dll_reset; bool tuning_done; bool calibration_done; @@ -2167,6 +2166,7 @@ static const struct of_device_id sdhci_msm_dt_match[] = { {.compatible = "qcom,sdhci-msm-v5", .data = &sdhci_msm_v5_var}, {.compatible = "qcom,sdm845-sdhci", .data = &sdm845_sdhci_var}, {.compatible = "qcom,sm8250-sdhci", .data = &sm8250_sdhci_var}, + {.compatible = "qcom,sc7180-sdhci", .data = &sdm845_sdhci_var}, {}, }; @@ -2301,11 +2301,9 @@ static int sdhci_msm_probe(struct platform_device *pdev) /* OPP table is optional */ ret = dev_pm_opp_of_add_table(&pdev->dev); - if (!ret) { - msm_host->has_opp_table = true; - } else if (ret != -ENODEV) { + if (ret && ret != -ENODEV) { dev_err(&pdev->dev, "Invalid OPP table in Device tree\n"); - goto opp_cleanup; + goto opp_put_clkname; } /* Vote for maximum clock rate for maximum performance */ @@ -2469,8 +2467,8 @@ static int sdhci_msm_probe(struct platform_device *pdev) clk_bulk_disable_unprepare(ARRAY_SIZE(msm_host->bulk_clks), msm_host->bulk_clks); opp_cleanup: - if (msm_host->has_opp_table) - dev_pm_opp_of_remove_table(&pdev->dev); + dev_pm_opp_of_remove_table(&pdev->dev); +opp_put_clkname: dev_pm_opp_put_clkname(msm_host->opp_table); bus_clk_disable: if (!IS_ERR(msm_host->bus_clk)) @@ -2490,8 +2488,7 @@ static int sdhci_msm_remove(struct platform_device *pdev) sdhci_remove_host(host, dead); - if (msm_host->has_opp_table) - dev_pm_opp_of_remove_table(&pdev->dev); + dev_pm_opp_of_remove_table(&pdev->dev); dev_pm_opp_put_clkname(msm_host->opp_table); pm_runtime_get_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); @@ -2557,6 +2554,7 @@ static struct platform_driver sdhci_msm_driver = { .name = "sdhci_msm", .of_match_table = sdhci_msm_dt_match, .pm = &sdhci_msm_pm_ops, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index f186fbd016b1..829ccef87426 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -1543,10 +1543,9 @@ static int sdhci_arasan_probe(struct platform_device *pdev) of_node_put(node); if (IS_ERR(sdhci_arasan->soc_ctl_base)) { - ret = PTR_ERR(sdhci_arasan->soc_ctl_base); - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, "Can't get syscon: %d\n", - ret); + ret = dev_err_probe(&pdev->dev, + PTR_ERR(sdhci_arasan->soc_ctl_base), + "Can't get syscon\n"); goto err_pltfm_free; } } @@ -1694,6 +1693,7 @@ static int sdhci_arasan_remove(struct platform_device *pdev) static struct platform_driver sdhci_arasan_driver = { .driver = { .name = "sdhci-arasan", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_arasan_of_match, .pm = &sdhci_arasan_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-of-aspeed.c b/drivers/mmc/host/sdhci-of-aspeed.c index a1bcc0f4ba9e..4f008ba3280e 100644 --- a/drivers/mmc/host/sdhci-of-aspeed.c +++ b/drivers/mmc/host/sdhci-of-aspeed.c @@ -240,6 +240,7 @@ static const struct of_device_id aspeed_sdhci_of_match[] = { static struct platform_driver aspeed_sdhci_driver = { .driver = { .name = "sdhci-aspeed", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = aspeed_sdhci_of_match, }, .probe = aspeed_sdhci_probe, @@ -318,6 +319,7 @@ MODULE_DEVICE_TABLE(of, aspeed_sdc_of_match); static struct platform_driver aspeed_sdc_driver = { .driver = { .name = "sd-controller-aspeed", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, .of_match_table = aspeed_sdc_of_match, }, diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 1ece2c50042c..5564d7b23e7c 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -465,6 +465,7 @@ static int sdhci_at91_remove(struct platform_device *pdev) static struct platform_driver sdhci_at91_driver = { .driver = { .name = "sdhci-at91", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_at91_dt_match, .pm = &sdhci_at91_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-of-dwcmshc.c b/drivers/mmc/host/sdhci-of-dwcmshc.c index 64ac0dbee95c..4b673792b5a4 100644 --- a/drivers/mmc/host/sdhci-of-dwcmshc.c +++ b/drivers/mmc/host/sdhci-of-dwcmshc.c @@ -214,6 +214,7 @@ MODULE_DEVICE_TABLE(of, sdhci_dwcmshc_dt_ids); static struct platform_driver sdhci_dwcmshc_driver = { .driver = { .name = "sdhci-dwcmshc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_dwcmshc_dt_ids, .pm = &dwcmshc_pmops, }, diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 45881b309956..0b45eff6fed4 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -1360,13 +1360,19 @@ static void esdhc_init(struct platform_device *pdev, struct sdhci_host *host) clk_put(clk); } - if (esdhc->peripheral_clock) { - esdhc_clock_enable(host, false); - val = sdhci_readl(host, ESDHC_DMA_SYSCTL); + esdhc_clock_enable(host, false); + val = sdhci_readl(host, ESDHC_DMA_SYSCTL); + /* + * This bit is not able to be reset by SDHCI_RESET_ALL. Need to + * initialize it as 1 or 0 once, to override the different value + * which may be configured in bootloader. + */ + if (esdhc->peripheral_clock) val |= ESDHC_PERIPHERAL_CLK_SEL; - sdhci_writel(host, val, ESDHC_DMA_SYSCTL); - esdhc_clock_enable(host, true); - } + else + val &= ~ESDHC_PERIPHERAL_CLK_SEL; + sdhci_writel(host, val, ESDHC_DMA_SYSCTL); + esdhc_clock_enable(host, true); } static int esdhc_hs400_prepare_ddr(struct mmc_host *mmc) @@ -1468,6 +1474,7 @@ static int sdhci_esdhc_probe(struct platform_device *pdev) static struct platform_driver sdhci_esdhc_driver = { .driver = { .name = "sdhci-esdhc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_esdhc_of_match, .pm = &esdhc_of_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-of-hlwd.c b/drivers/mmc/host/sdhci-of-hlwd.c index da844a39af6e..12675797b296 100644 --- a/drivers/mmc/host/sdhci-of-hlwd.c +++ b/drivers/mmc/host/sdhci-of-hlwd.c @@ -80,6 +80,7 @@ MODULE_DEVICE_TABLE(of, sdhci_hlwd_of_match); static struct platform_driver sdhci_hlwd_driver = { .driver = { .name = "sdhci-hlwd", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_hlwd_of_match, .pm = &sdhci_pltfm_pmops, }, diff --git a/drivers/mmc/host/sdhci-of-sparx5.c b/drivers/mmc/host/sdhci-of-sparx5.c new file mode 100644 index 000000000000..28e4ee69e100 --- /dev/null +++ b/drivers/mmc/host/sdhci-of-sparx5.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * drivers/mmc/host/sdhci-of-sparx5.c + * + * MCHP Sparx5 SoC Secure Digital Host Controller Interface. + * + * Copyright (c) 2019 Microchip Inc. + * + * Author: Lars Povlsen + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sdhci-pltfm.h" + +#define CPU_REGS_GENERAL_CTRL (0x22 * 4) +#define MSHC_DLY_CC_MASK GENMASK(16, 13) +#define MSHC_DLY_CC_SHIFT 13 +#define MSHC_DLY_CC_MAX 15 + +#define CPU_REGS_PROC_CTRL (0x2C * 4) +#define ACP_CACHE_FORCE_ENA BIT(4) +#define ACP_AWCACHE BIT(3) +#define ACP_ARCACHE BIT(2) +#define ACP_CACHE_MASK (ACP_CACHE_FORCE_ENA|ACP_AWCACHE|ACP_ARCACHE) + +#define MSHC2_VERSION 0x500 /* Off 0x140, reg 0x0 */ +#define MSHC2_TYPE 0x504 /* Off 0x140, reg 0x1 */ +#define MSHC2_EMMC_CTRL 0x52c /* Off 0x140, reg 0xB */ +#define MSHC2_EMMC_CTRL_EMMC_RST_N BIT(2) +#define MSHC2_EMMC_CTRL_IS_EMMC BIT(0) + +struct sdhci_sparx5_data { + struct sdhci_host *host; + struct regmap *cpu_ctrl; + int delay_clock; +}; + +#define BOUNDARY_OK(addr, len) \ + ((addr | (SZ_128M - 1)) == ((addr + len - 1) | (SZ_128M - 1))) + +/* + * If DMA addr spans 128MB boundary, we split the DMA transfer into two + * so that each DMA transfer doesn't exceed the boundary. + */ +static void sdhci_sparx5_adma_write_desc(struct sdhci_host *host, void **desc, + dma_addr_t addr, int len, + unsigned int cmd) +{ + int tmplen, offset; + + if (likely(!len || BOUNDARY_OK(addr, len))) { + sdhci_adma_write_desc(host, desc, addr, len, cmd); + return; + } + + pr_debug("%s: write_desc: splitting dma len %d, offset %pad\n", + mmc_hostname(host->mmc), len, &addr); + + offset = addr & (SZ_128M - 1); + tmplen = SZ_128M - offset; + sdhci_adma_write_desc(host, desc, addr, tmplen, cmd); + + addr += tmplen; + len -= tmplen; + sdhci_adma_write_desc(host, desc, addr, len, cmd); +} + +static void sparx5_set_cacheable(struct sdhci_host *host, u32 value) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_sparx5_data *sdhci_sparx5 = sdhci_pltfm_priv(pltfm_host); + + pr_debug("%s: Set Cacheable = 0x%x\n", mmc_hostname(host->mmc), value); + + /* Update ACP caching attributes in HW */ + regmap_update_bits(sdhci_sparx5->cpu_ctrl, + CPU_REGS_PROC_CTRL, ACP_CACHE_MASK, value); +} + +static void sparx5_set_delay(struct sdhci_host *host, u8 value) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_sparx5_data *sdhci_sparx5 = sdhci_pltfm_priv(pltfm_host); + + pr_debug("%s: Set DLY_CC = %u\n", mmc_hostname(host->mmc), value); + + /* Update DLY_CC in HW */ + regmap_update_bits(sdhci_sparx5->cpu_ctrl, + CPU_REGS_GENERAL_CTRL, + MSHC_DLY_CC_MASK, + (value << MSHC_DLY_CC_SHIFT)); +} + +static void sdhci_sparx5_set_emmc(struct sdhci_host *host) +{ + if (!mmc_card_is_removable(host->mmc)) { + u8 value; + + value = sdhci_readb(host, MSHC2_EMMC_CTRL); + if (!(value & MSHC2_EMMC_CTRL_IS_EMMC)) { + value |= MSHC2_EMMC_CTRL_IS_EMMC; + pr_debug("%s: Set EMMC_CTRL: 0x%08x\n", + mmc_hostname(host->mmc), value); + sdhci_writeb(host, value, MSHC2_EMMC_CTRL); + } + } +} + +static void sdhci_sparx5_reset_emmc(struct sdhci_host *host) +{ + u8 value; + + pr_debug("%s: Toggle EMMC_CTRL.EMMC_RST_N\n", mmc_hostname(host->mmc)); + value = sdhci_readb(host, MSHC2_EMMC_CTRL) & + ~MSHC2_EMMC_CTRL_EMMC_RST_N; + sdhci_writeb(host, value, MSHC2_EMMC_CTRL); + /* For eMMC, minimum is 1us but give it 10us for good measure */ + usleep_range(10, 20); + sdhci_writeb(host, value | MSHC2_EMMC_CTRL_EMMC_RST_N, + MSHC2_EMMC_CTRL); + /* For eMMC, minimum is 200us but give it 300us for good measure */ + usleep_range(300, 400); +} + +static void sdhci_sparx5_reset(struct sdhci_host *host, u8 mask) +{ + pr_debug("%s: *** RESET: mask %d\n", mmc_hostname(host->mmc), mask); + + sdhci_reset(host, mask); + + /* Be sure CARD_IS_EMMC stays set */ + sdhci_sparx5_set_emmc(host); +} + +static const struct sdhci_ops sdhci_sparx5_ops = { + .set_clock = sdhci_set_clock, + .set_bus_width = sdhci_set_bus_width, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .get_max_clock = sdhci_pltfm_clk_get_max_clock, + .reset = sdhci_sparx5_reset, + .adma_write_desc = sdhci_sparx5_adma_write_desc, +}; + +static const struct sdhci_pltfm_data sdhci_sparx5_pdata = { + .quirks = 0, + .quirks2 = SDHCI_QUIRK2_HOST_NO_CMD23 | /* Controller issue */ + SDHCI_QUIRK2_NO_1_8_V, /* No sdr104, ddr50, etc */ + .ops = &sdhci_sparx5_ops, +}; + +static int sdhci_sparx5_probe(struct platform_device *pdev) +{ + int ret; + const char *syscon = "microchip,sparx5-cpu-syscon"; + struct sdhci_host *host; + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_sparx5_data *sdhci_sparx5; + struct device_node *np = pdev->dev.of_node; + u32 value; + u32 extra; + + host = sdhci_pltfm_init(pdev, &sdhci_sparx5_pdata, + sizeof(*sdhci_sparx5)); + + if (IS_ERR(host)) + return PTR_ERR(host); + + /* + * extra adma table cnt for cross 128M boundary handling. + */ + extra = DIV_ROUND_UP_ULL(dma_get_required_mask(&pdev->dev), SZ_128M); + if (extra > SDHCI_MAX_SEGS) + extra = SDHCI_MAX_SEGS; + host->adma_table_cnt += extra; + + pltfm_host = sdhci_priv(host); + sdhci_sparx5 = sdhci_pltfm_priv(pltfm_host); + sdhci_sparx5->host = host; + + pltfm_host->clk = devm_clk_get(&pdev->dev, "core"); + if (IS_ERR(pltfm_host->clk)) { + ret = PTR_ERR(pltfm_host->clk); + dev_err(&pdev->dev, "failed to get core clk: %d\n", ret); + goto free_pltfm; + } + ret = clk_prepare_enable(pltfm_host->clk); + if (ret) + goto free_pltfm; + + if (!of_property_read_u32(np, "microchip,clock-delay", &value) && + (value > 0 && value <= MSHC_DLY_CC_MAX)) + sdhci_sparx5->delay_clock = value; + + sdhci_get_of_property(pdev); + + ret = mmc_of_parse(host->mmc); + if (ret) + goto err_clk; + + sdhci_sparx5->cpu_ctrl = syscon_regmap_lookup_by_compatible(syscon); + if (IS_ERR(sdhci_sparx5->cpu_ctrl)) { + dev_err(&pdev->dev, "No CPU syscon regmap !\n"); + ret = PTR_ERR(sdhci_sparx5->cpu_ctrl); + goto err_clk; + } + + if (sdhci_sparx5->delay_clock >= 0) + sparx5_set_delay(host, sdhci_sparx5->delay_clock); + + if (!mmc_card_is_removable(host->mmc)) { + /* Do a HW reset of eMMC card */ + sdhci_sparx5_reset_emmc(host); + /* Update EMMC_CTRL */ + sdhci_sparx5_set_emmc(host); + /* If eMMC, disable SD and SDIO */ + host->mmc->caps2 |= (MMC_CAP2_NO_SDIO|MMC_CAP2_NO_SD); + } + + ret = sdhci_add_host(host); + if (ret) + goto err_clk; + + /* Set AXI bus master to use un-cached access (for DMA) */ + if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA) && + IS_ENABLED(CONFIG_DMA_DECLARE_COHERENT)) + sparx5_set_cacheable(host, ACP_CACHE_FORCE_ENA); + + pr_debug("%s: SDHC version: 0x%08x\n", + mmc_hostname(host->mmc), sdhci_readl(host, MSHC2_VERSION)); + pr_debug("%s: SDHC type: 0x%08x\n", + mmc_hostname(host->mmc), sdhci_readl(host, MSHC2_TYPE)); + + return ret; + +err_clk: + clk_disable_unprepare(pltfm_host->clk); +free_pltfm: + sdhci_pltfm_free(pdev); + return ret; +} + +static const struct of_device_id sdhci_sparx5_of_match[] = { + { .compatible = "microchip,dw-sparx5-sdhci" }, + { } +}; +MODULE_DEVICE_TABLE(of, sdhci_sparx5_of_match); + +static struct platform_driver sdhci_sparx5_driver = { + .driver = { + .name = "sdhci-sparx5", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, + .of_match_table = sdhci_sparx5_of_match, + .pm = &sdhci_pltfm_pmops, + }, + .probe = sdhci_sparx5_probe, + .remove = sdhci_pltfm_unregister, +}; + +module_platform_driver(sdhci_sparx5_driver); + +MODULE_DESCRIPTION("Sparx5 SDHCI OF driver"); +MODULE_AUTHOR("Lars Povlsen "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c index 1ec74c2d5c17..7893fd3599b6 100644 --- a/drivers/mmc/host/sdhci-omap.c +++ b/drivers/mmc/host/sdhci-omap.c @@ -1297,6 +1297,7 @@ static struct platform_driver sdhci_omap_driver = { .remove = sdhci_omap_remove, .driver = { .name = "sdhci-omap", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_omap_dev_pm_ops, .of_match_table = omap_sdhci_match, }, diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index 914f5184295f..23da7f7fe093 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -516,6 +518,8 @@ struct intel_host { bool rpm_retune_ok; u32 glk_rx_ctrl1; u32 glk_tun_val; + u32 active_ltr; + u32 idle_ltr; }; static const guid_t intel_dsm_guid = @@ -760,6 +764,108 @@ static int intel_execute_tuning(struct mmc_host *mmc, u32 opcode) return 0; } +#define INTEL_ACTIVELTR 0x804 +#define INTEL_IDLELTR 0x808 + +#define INTEL_LTR_REQ BIT(15) +#define INTEL_LTR_SCALE_MASK GENMASK(11, 10) +#define INTEL_LTR_SCALE_1US (2 << 10) +#define INTEL_LTR_SCALE_32US (3 << 10) +#define INTEL_LTR_VALUE_MASK GENMASK(9, 0) + +static void intel_cache_ltr(struct sdhci_pci_slot *slot) +{ + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct sdhci_host *host = slot->host; + + intel_host->active_ltr = readl(host->ioaddr + INTEL_ACTIVELTR); + intel_host->idle_ltr = readl(host->ioaddr + INTEL_IDLELTR); +} + +static void intel_ltr_set(struct device *dev, s32 val) +{ + struct sdhci_pci_chip *chip = dev_get_drvdata(dev); + struct sdhci_pci_slot *slot = chip->slots[0]; + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct sdhci_host *host = slot->host; + u32 ltr; + + pm_runtime_get_sync(dev); + + /* + * Program latency tolerance (LTR) accordingly what has been asked + * by the PM QoS layer or disable it in case we were passed + * negative value or PM_QOS_LATENCY_ANY. + */ + ltr = readl(host->ioaddr + INTEL_ACTIVELTR); + + if (val == PM_QOS_LATENCY_ANY || val < 0) { + ltr &= ~INTEL_LTR_REQ; + } else { + ltr |= INTEL_LTR_REQ; + ltr &= ~INTEL_LTR_SCALE_MASK; + ltr &= ~INTEL_LTR_VALUE_MASK; + + if (val > INTEL_LTR_VALUE_MASK) { + val >>= 5; + if (val > INTEL_LTR_VALUE_MASK) + val = INTEL_LTR_VALUE_MASK; + ltr |= INTEL_LTR_SCALE_32US | val; + } else { + ltr |= INTEL_LTR_SCALE_1US | val; + } + } + + if (ltr == intel_host->active_ltr) + goto out; + + writel(ltr, host->ioaddr + INTEL_ACTIVELTR); + writel(ltr, host->ioaddr + INTEL_IDLELTR); + + /* Cache the values into lpss structure */ + intel_cache_ltr(slot); +out: + pm_runtime_put_autosuspend(dev); +} + +static bool intel_use_ltr(struct sdhci_pci_chip *chip) +{ + switch (chip->pdev->device) { + case PCI_DEVICE_ID_INTEL_BYT_EMMC: + case PCI_DEVICE_ID_INTEL_BYT_EMMC2: + case PCI_DEVICE_ID_INTEL_BYT_SDIO: + case PCI_DEVICE_ID_INTEL_BYT_SD: + case PCI_DEVICE_ID_INTEL_BSW_EMMC: + case PCI_DEVICE_ID_INTEL_BSW_SDIO: + case PCI_DEVICE_ID_INTEL_BSW_SD: + return false; + default: + return true; + } +} + +static void intel_ltr_expose(struct sdhci_pci_chip *chip) +{ + struct device *dev = &chip->pdev->dev; + + if (!intel_use_ltr(chip)) + return; + + dev->power.set_latency_tolerance = intel_ltr_set; + dev_pm_qos_expose_latency_tolerance(dev); +} + +static void intel_ltr_hide(struct sdhci_pci_chip *chip) +{ + struct device *dev = &chip->pdev->dev; + + if (!intel_use_ltr(chip)) + return; + + dev_pm_qos_hide_latency_tolerance(dev); + dev->power.set_latency_tolerance = NULL; +} + static void byt_probe_slot(struct sdhci_pci_slot *slot) { struct mmc_host_ops *ops = &slot->host->mmc_host_ops; @@ -774,6 +880,43 @@ static void byt_probe_slot(struct sdhci_pci_slot *slot) ops->start_signal_voltage_switch = intel_start_signal_voltage_switch; device_property_read_u32(dev, "max-frequency", &mmc->f_max); + + if (!mmc->slotno) { + slot->chip->slots[mmc->slotno] = slot; + intel_ltr_expose(slot->chip); + } +} + +static void byt_add_debugfs(struct sdhci_pci_slot *slot) +{ + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct mmc_host *mmc = slot->host->mmc; + struct dentry *dir = mmc->debugfs_root; + + if (!intel_use_ltr(slot->chip)) + return; + + debugfs_create_x32("active_ltr", 0444, dir, &intel_host->active_ltr); + debugfs_create_x32("idle_ltr", 0444, dir, &intel_host->idle_ltr); + + intel_cache_ltr(slot); +} + +static int byt_add_host(struct sdhci_pci_slot *slot) +{ + int ret = sdhci_add_host(slot->host); + + if (!ret) + byt_add_debugfs(slot); + return ret; +} + +static void byt_remove_slot(struct sdhci_pci_slot *slot, int dead) +{ + struct mmc_host *mmc = slot->host->mmc; + + if (!mmc->slotno) + intel_ltr_hide(slot->chip); } static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot) @@ -855,6 +998,8 @@ static int glk_emmc_add_host(struct sdhci_pci_slot *slot) if (ret) goto cleanup; + byt_add_debugfs(slot); + return 0; cleanup: @@ -1032,6 +1177,8 @@ static const struct sdhci_pci_fixes sdhci_intel_byt_emmc = { #endif .allow_runtime_pm = true, .probe_slot = byt_emmc_probe_slot, + .add_host = byt_add_host, + .remove_slot = byt_remove_slot, .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC | SDHCI_QUIRK_NO_LED, .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | @@ -1045,6 +1192,7 @@ static const struct sdhci_pci_fixes sdhci_intel_glk_emmc = { .allow_runtime_pm = true, .probe_slot = glk_emmc_probe_slot, .add_host = glk_emmc_add_host, + .remove_slot = byt_remove_slot, #ifdef CONFIG_PM_SLEEP .suspend = sdhci_cqhci_suspend, .resume = sdhci_cqhci_resume, @@ -1075,6 +1223,8 @@ static const struct sdhci_pci_fixes sdhci_ni_byt_sdio = { SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .allow_runtime_pm = true, .probe_slot = ni_byt_sdio_probe_slot, + .add_host = byt_add_host, + .remove_slot = byt_remove_slot, .ops = &sdhci_intel_byt_ops, .priv_size = sizeof(struct intel_host), }; @@ -1092,6 +1242,8 @@ static const struct sdhci_pci_fixes sdhci_intel_byt_sdio = { SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .allow_runtime_pm = true, .probe_slot = byt_sdio_probe_slot, + .add_host = byt_add_host, + .remove_slot = byt_remove_slot, .ops = &sdhci_intel_byt_ops, .priv_size = sizeof(struct intel_host), }; @@ -1111,6 +1263,8 @@ static const struct sdhci_pci_fixes sdhci_intel_byt_sd = { .allow_runtime_pm = true, .own_cd_for_runtime_pm = true, .probe_slot = byt_sd_probe_slot, + .add_host = byt_add_host, + .remove_slot = byt_remove_slot, .ops = &sdhci_intel_byt_ops, .priv_size = sizeof(struct intel_host), }; diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c index 5da2b06d84ae..9887485a4134 100644 --- a/drivers/mmc/host/sdhci-pci-gli.c +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -14,6 +14,7 @@ #include #include "sdhci.h" #include "sdhci-pci.h" +#include "cqhci.h" /* Genesys Logic extra registers */ #define SDHCI_GLI_9750_WT 0x800 @@ -81,9 +82,16 @@ #define GLI_9763E_VHS_REV_R 0x0 #define GLI_9763E_VHS_REV_M 0x1 #define GLI_9763E_VHS_REV_W 0x2 +#define PCIE_GLI_9763E_MB 0x888 +#define GLI_9763E_MB_CMDQ_OFF BIT(19) #define PCIE_GLI_9763E_SCR 0x8E0 #define GLI_9763E_SCR_AXI_REQ BIT(9) +#define SDHCI_GLI_9763E_CQE_BASE_ADDR 0x200 +#define GLI_9763E_CQE_TRNS_MODE (SDHCI_TRNS_MULTI | \ + SDHCI_TRNS_BLK_CNT_EN | \ + SDHCI_TRNS_DMA) + #define PCI_GLI_9755_WT 0x800 #define PCI_GLI_9755_WT_EN BIT(0) #define GLI_9755_WT_EN_ON 0x1 @@ -578,6 +586,30 @@ static int sdhci_pci_gli_resume(struct sdhci_pci_chip *chip) return sdhci_pci_resume_host(chip); } + +static int sdhci_cqhci_gli_resume(struct sdhci_pci_chip *chip) +{ + struct sdhci_pci_slot *slot = chip->slots[0]; + int ret; + + ret = sdhci_pci_gli_resume(chip); + if (ret) + return ret; + + return cqhci_resume(slot->host->mmc); +} + +static int sdhci_cqhci_gli_suspend(struct sdhci_pci_chip *chip) +{ + struct sdhci_pci_slot *slot = chip->slots[0]; + int ret; + + ret = cqhci_suspend(slot->host->mmc); + if (ret) + return ret; + + return sdhci_suspend_host(slot->host); +} #endif static void gl9763e_hs400_enhanced_strobe(struct mmc_host *mmc, @@ -614,6 +646,110 @@ static void sdhci_set_gl9763e_signaling(struct sdhci_host *host, sdhci_writew(host, ctrl_2, SDHCI_HOST_CONTROL2); } +static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc) +{ + sdhci_dumpregs(mmc_priv(mmc)); +} + +static void sdhci_gl9763e_cqe_pre_enable(struct mmc_host *mmc) +{ + struct cqhci_host *cq_host = mmc->cqe_private; + u32 value; + + value = cqhci_readl(cq_host, CQHCI_CFG); + value |= CQHCI_ENABLE; + cqhci_writel(cq_host, value, CQHCI_CFG); +} + +static void sdhci_gl9763e_cqe_enable(struct mmc_host *mmc) +{ + struct sdhci_host *host = mmc_priv(mmc); + + sdhci_writew(host, GLI_9763E_CQE_TRNS_MODE, SDHCI_TRANSFER_MODE); + sdhci_cqe_enable(mmc); +} + +static u32 sdhci_gl9763e_cqhci_irq(struct sdhci_host *host, u32 intmask) +{ + int cmd_error = 0; + int data_error = 0; + + if (!sdhci_cqe_irq(host, intmask, &cmd_error, &data_error)) + return intmask; + + cqhci_irq(host->mmc, intmask, cmd_error, data_error); + + return 0; +} + +static void sdhci_gl9763e_cqe_post_disable(struct mmc_host *mmc) +{ + struct sdhci_host *host = mmc_priv(mmc); + struct cqhci_host *cq_host = mmc->cqe_private; + u32 value; + + value = cqhci_readl(cq_host, CQHCI_CFG); + value &= ~CQHCI_ENABLE; + cqhci_writel(cq_host, value, CQHCI_CFG); + sdhci_writew(host, 0x0, SDHCI_TRANSFER_MODE); +} + +static const struct cqhci_host_ops sdhci_gl9763e_cqhci_ops = { + .enable = sdhci_gl9763e_cqe_enable, + .disable = sdhci_cqe_disable, + .dumpregs = sdhci_gl9763e_dumpregs, + .pre_enable = sdhci_gl9763e_cqe_pre_enable, + .post_disable = sdhci_gl9763e_cqe_post_disable, +}; + +static int gl9763e_add_host(struct sdhci_pci_slot *slot) +{ + struct device *dev = &slot->chip->pdev->dev; + struct sdhci_host *host = slot->host; + struct cqhci_host *cq_host; + bool dma64; + int ret; + + ret = sdhci_setup_host(host); + if (ret) + return ret; + + cq_host = devm_kzalloc(dev, sizeof(*cq_host), GFP_KERNEL); + if (!cq_host) { + ret = -ENOMEM; + goto cleanup; + } + + cq_host->mmio = host->ioaddr + SDHCI_GLI_9763E_CQE_BASE_ADDR; + cq_host->ops = &sdhci_gl9763e_cqhci_ops; + + dma64 = host->flags & SDHCI_USE_64_BIT_DMA; + if (dma64) + cq_host->caps |= CQHCI_TASK_DESC_SZ_128; + + ret = cqhci_init(cq_host, host->mmc, dma64); + if (ret) + goto cleanup; + + ret = __sdhci_add_host(host); + if (ret) + goto cleanup; + + return 0; + +cleanup: + sdhci_cleanup_host(host); + return ret; +} + +static void sdhci_gl9763e_reset(struct sdhci_host *host, u8 mask) +{ + if ((host->mmc->caps2 & MMC_CAP2_CQE) && (mask & SDHCI_RESET_ALL) && + host->mmc->cqe_private) + cqhci_deactivate(host->mmc); + sdhci_reset(host, mask); +} + static void gli_set_gl9763e(struct sdhci_pci_slot *slot) { struct pci_dev *pdev = slot->chip->pdev; @@ -636,7 +772,9 @@ static void gli_set_gl9763e(struct sdhci_pci_slot *slot) static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot) { + struct pci_dev *pdev = slot->chip->pdev; struct sdhci_host *host = slot->host; + u32 value; host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_1_8V_DDR | @@ -646,6 +784,11 @@ static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot) MMC_CAP2_HS400_ES | MMC_CAP2_NO_SDIO | MMC_CAP2_NO_SD; + + pci_read_config_dword(pdev, PCIE_GLI_9763E_MB, &value); + if (!(value & GLI_9763E_MB_CMDQ_OFF)) + host->mmc->caps2 |= MMC_CAP2_CQE | MMC_CAP2_CQE_DCMD; + gli_pcie_enable_msi(slot); host->mmc_host_ops.hs400_enhanced_strobe = gl9763e_hs400_enhanced_strobe; @@ -699,9 +842,10 @@ static const struct sdhci_ops sdhci_gl9763e_ops = { .set_clock = sdhci_set_clock, .enable_dma = sdhci_pci_enable_dma, .set_bus_width = sdhci_set_bus_width, - .reset = sdhci_reset, + .reset = sdhci_gl9763e_reset, .set_uhs_signaling = sdhci_set_gl9763e_signaling, .voltage_switch = sdhci_gli_voltage_switch, + .irq = sdhci_gl9763e_cqhci_irq, }; const struct sdhci_pci_fixes sdhci_gl9763e = { @@ -709,6 +853,8 @@ const struct sdhci_pci_fixes sdhci_gl9763e = { .probe_slot = gli_probe_slot_gl9763e, .ops = &sdhci_gl9763e_ops, #ifdef CONFIG_PM_SLEEP - .resume = sdhci_pci_gli_resume, + .resume = sdhci_cqhci_gli_resume, + .suspend = sdhci_cqhci_gli_suspend, #endif + .add_host = gl9763e_add_host, }; diff --git a/drivers/mmc/host/sdhci-pic32.c b/drivers/mmc/host/sdhci-pic32.c index a11e6397d4ff..6ce1519ae177 100644 --- a/drivers/mmc/host/sdhci-pic32.c +++ b/drivers/mmc/host/sdhci-pic32.c @@ -241,6 +241,7 @@ MODULE_DEVICE_TABLE(of, pic32_sdhci_id_table); static struct platform_driver pic32_sdhci_driver = { .driver = { .name = "pic32-sdhci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(pic32_sdhci_id_table), }, .probe = pic32_sdhci_probe, diff --git a/drivers/mmc/host/sdhci-pxav2.c b/drivers/mmc/host/sdhci-pxav2.c index 9282bc4b8c41..f18906b5575f 100644 --- a/drivers/mmc/host/sdhci-pxav2.c +++ b/drivers/mmc/host/sdhci-pxav2.c @@ -226,6 +226,7 @@ static int sdhci_pxav2_probe(struct platform_device *pdev) static struct platform_driver sdhci_pxav2_driver = { .driver = { .name = "sdhci-pxav2", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_pxav2_of_match), .pm = &sdhci_pltfm_pmops, }, diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c index e55037ceda73..a6d89a3f1946 100644 --- a/drivers/mmc/host/sdhci-pxav3.c +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -567,6 +567,7 @@ static const struct dev_pm_ops sdhci_pxav3_pmops = { static struct platform_driver sdhci_pxav3_driver = { .driver = { .name = "sdhci-pxav3", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_pxav3_of_match), .pm = &sdhci_pxav3_pmops, }, diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index 080ced1e63f0..f48a788a9d3d 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -461,7 +461,9 @@ static int sdhci_s3c_parse_dt(struct device *dev, } #endif +#ifdef CONFIG_OF static const struct of_device_id sdhci_s3c_dt_match[]; +#endif static inline struct sdhci_s3c_drv_data *sdhci_s3c_get_driver_data( struct platform_device *pdev) @@ -784,6 +786,7 @@ static struct platform_driver sdhci_s3c_driver = { .id_table = sdhci_s3c_driver_ids, .driver = { .name = "s3c-sdhci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_s3c_dt_match), .pm = &sdhci_s3c_pmops, }, diff --git a/drivers/mmc/host/sdhci-sirf.c b/drivers/mmc/host/sdhci-sirf.c index f4b05dd6c20a..e9b347b3af7e 100644 --- a/drivers/mmc/host/sdhci-sirf.c +++ b/drivers/mmc/host/sdhci-sirf.c @@ -220,6 +220,7 @@ MODULE_DEVICE_TABLE(of, sdhci_sirf_of_match); static struct platform_driver sdhci_sirf_driver = { .driver = { .name = "sdhci-sirf", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_sirf_of_match, .pm = &sdhci_pltfm_pmops, }, diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index b4b63089a4e2..d463e2fd5b1a 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -181,6 +181,7 @@ MODULE_DEVICE_TABLE(of, sdhci_spear_id_table); static struct platform_driver sdhci_driver = { .driver = { .name = "sdhci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pm_ops, .of_match_table = of_match_ptr(sdhci_spear_id_table), }, diff --git a/drivers/mmc/host/sdhci-sprd.c b/drivers/mmc/host/sdhci-sprd.c index bafa2e41c8b6..58109c5b53e2 100644 --- a/drivers/mmc/host/sdhci-sprd.c +++ b/drivers/mmc/host/sdhci-sprd.c @@ -387,7 +387,7 @@ static void sdhci_sprd_request_done(struct sdhci_host *host, if (mmc_hsq_finalize_request(host->mmc, mrq)) return; - mmc_request_done(host->mmc, mrq); + mmc_request_done(host->mmc, mrq); } static struct sdhci_ops sdhci_sprd_ops = { @@ -433,7 +433,7 @@ static void sdhci_sprd_request(struct mmc_host *mmc, struct mmc_request *mrq) } static int sdhci_sprd_request_atomic(struct mmc_host *mmc, - struct mmc_request *mrq) + struct mmc_request *mrq) { sdhci_sprd_check_auto_cmd23(mmc, mrq); @@ -787,6 +787,7 @@ static struct platform_driver sdhci_sprd_driver = { .remove = sdhci_sprd_remove, .driver = { .name = "sdhci_sprd_r11", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_sprd_of_match), .pm = &sdhci_sprd_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-st.c b/drivers/mmc/host/sdhci-st.c index 1301cebfc3ea..4e9ff3e828ba 100644 --- a/drivers/mmc/host/sdhci-st.c +++ b/drivers/mmc/host/sdhci-st.c @@ -521,6 +521,7 @@ static struct platform_driver sdhci_st_driver = { .remove = sdhci_st_remove, .driver = { .name = "sdhci-st", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_st_pmops, .of_match_table = of_match_ptr(st_sdhci_match), }, diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 13fbf70b5fde..ed12aacb1c73 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -1660,11 +1660,8 @@ static int sdhci_tegra_probe(struct platform_device *pdev) clk = devm_clk_get(mmc_dev(host->mmc), NULL); if (IS_ERR(clk)) { - rc = PTR_ERR(clk); - - if (rc != -EPROBE_DEFER) - dev_err(&pdev->dev, "failed to get clock: %d\n", rc); - + rc = dev_err_probe(&pdev->dev, PTR_ERR(clk), + "failed to get clock\n"); goto err_clk_get; } clk_prepare_enable(clk); @@ -1785,6 +1782,7 @@ static SIMPLE_DEV_PM_OPS(sdhci_tegra_dev_pm_ops, sdhci_tegra_suspend, static struct platform_driver sdhci_tegra_driver = { .driver = { .name = "sdhci-tegra", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_tegra_dt_match, .pm = &sdhci_tegra_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index 4703cd540c7f..24c978de2a3f 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -677,6 +677,7 @@ MODULE_DEVICE_TABLE(of, sdhci_xenon_dt_ids); static struct platform_driver sdhci_xenon_driver = { .driver = { .name = "xenon-sdhci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_xenon_dt_ids, .pm = &sdhci_xenon_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index f9d24af12396..a64ea143d185 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -2,10 +2,11 @@ /* * sdhci_am654.c - SDHCI driver for TI's AM654 SOCs * - * Copyright (C) 2018 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2018 Texas Instruments Incorporated - https://www.ti.com * */ #include +#include #include #include #include @@ -18,9 +19,11 @@ /* CTL_CFG Registers */ #define CTL_CFG_2 0x14 +#define CTL_CFG_3 0x18 #define SLOTTYPE_MASK GENMASK(31, 30) #define SLOTTYPE_EMBEDDED BIT(30) +#define TUNINGFORSDR50_MASK BIT(13) /* PHY Registers */ #define PHY_CTRL1 0x100 @@ -65,6 +68,14 @@ #define RETRIM_MASK BIT(RETRIM_SHIFT) #define SELDLYTXCLK_SHIFT 17 #define SELDLYTXCLK_MASK BIT(SELDLYTXCLK_SHIFT) +#define SELDLYRXCLK_SHIFT 16 +#define SELDLYRXCLK_MASK BIT(SELDLYRXCLK_SHIFT) +#define ITAPDLYSEL_SHIFT 0 +#define ITAPDLYSEL_MASK GENMASK(4, 0) +#define ITAPDLYENA_SHIFT 8 +#define ITAPDLYENA_MASK BIT(ITAPDLYENA_SHIFT) +#define ITAPCHGWIN_SHIFT 9 +#define ITAPCHGWIN_MASK BIT(ITAPCHGWIN_SHIFT) #define DRIVER_STRENGTH_50_OHM 0x0 #define DRIVER_STRENGTH_33_OHM 0x1 @@ -72,7 +83,7 @@ #define DRIVER_STRENGTH_100_OHM 0x3 #define DRIVER_STRENGTH_40_OHM 0x4 -#define CLOCK_TOO_SLOW_HZ 400000 +#define CLOCK_TOO_SLOW_HZ 50000000 /* Command Queue Host Controller Interface Base address */ #define SDHCI_AM654_CQE_BASE_ADDR 0x200 @@ -84,14 +95,56 @@ static struct regmap_config sdhci_am654_regmap_config = { .fast_io = true, }; +struct timing_data { + const char *otap_binding; + const char *itap_binding; + u32 capability; +}; + +static const struct timing_data td[] = { + [MMC_TIMING_LEGACY] = {"ti,otap-del-sel-legacy", + "ti,itap-del-sel-legacy", + 0}, + [MMC_TIMING_MMC_HS] = {"ti,otap-del-sel-mmc-hs", + "ti,itap-del-sel-mmc-hs", + MMC_CAP_MMC_HIGHSPEED}, + [MMC_TIMING_SD_HS] = {"ti,otap-del-sel-sd-hs", + "ti,itap-del-sel-sd-hs", + MMC_CAP_SD_HIGHSPEED}, + [MMC_TIMING_UHS_SDR12] = {"ti,otap-del-sel-sdr12", + "ti,itap-del-sel-sdr12", + MMC_CAP_UHS_SDR12}, + [MMC_TIMING_UHS_SDR25] = {"ti,otap-del-sel-sdr25", + "ti,itap-del-sel-sdr25", + MMC_CAP_UHS_SDR25}, + [MMC_TIMING_UHS_SDR50] = {"ti,otap-del-sel-sdr50", + NULL, + MMC_CAP_UHS_SDR50}, + [MMC_TIMING_UHS_SDR104] = {"ti,otap-del-sel-sdr104", + NULL, + MMC_CAP_UHS_SDR104}, + [MMC_TIMING_UHS_DDR50] = {"ti,otap-del-sel-ddr50", + NULL, + MMC_CAP_UHS_DDR50}, + [MMC_TIMING_MMC_DDR52] = {"ti,otap-del-sel-ddr52", + "ti,itap-del-sel-ddr52", + MMC_CAP_DDR}, + [MMC_TIMING_MMC_HS200] = {"ti,otap-del-sel-hs200", + NULL, + MMC_CAP2_HS200}, + [MMC_TIMING_MMC_HS400] = {"ti,otap-del-sel-hs400", + NULL, + MMC_CAP2_HS400}, +}; + struct sdhci_am654_data { struct regmap *base; bool legacy_otapdly; - int otap_del_sel[11]; + int otap_del_sel[ARRAY_SIZE(td)]; + int itap_del_sel[ARRAY_SIZE(td)]; int clkbuf_sel; int trm_icp; int drv_strength; - bool dll_on; int strb_sel; u32 flags; }; @@ -106,26 +159,6 @@ struct sdhci_am654_driver_data { #define DLL_CALIB (1 << 4) }; -struct timing_data { - const char *binding; - u32 capability; -}; - -static const struct timing_data td[] = { - [MMC_TIMING_LEGACY] = {"ti,otap-del-sel-legacy", 0}, - [MMC_TIMING_MMC_HS] = {"ti,otap-del-sel-mmc-hs", MMC_CAP_MMC_HIGHSPEED}, - [MMC_TIMING_SD_HS] = {"ti,otap-del-sel-sd-hs", MMC_CAP_SD_HIGHSPEED}, - [MMC_TIMING_UHS_SDR12] = {"ti,otap-del-sel-sdr12", MMC_CAP_UHS_SDR12}, - [MMC_TIMING_UHS_SDR25] = {"ti,otap-del-sel-sdr25", MMC_CAP_UHS_SDR25}, - [MMC_TIMING_UHS_SDR50] = {"ti,otap-del-sel-sdr50", MMC_CAP_UHS_SDR50}, - [MMC_TIMING_UHS_SDR104] = {"ti,otap-del-sel-sdr104", - MMC_CAP_UHS_SDR104}, - [MMC_TIMING_UHS_DDR50] = {"ti,otap-del-sel-ddr50", MMC_CAP_UHS_DDR50}, - [MMC_TIMING_MMC_DDR52] = {"ti,otap-del-sel-ddr52", MMC_CAP_DDR}, - [MMC_TIMING_MMC_HS200] = {"ti,otap-del-sel-hs200", MMC_CAP2_HS200}, - [MMC_TIMING_MMC_HS400] = {"ti,otap-del-sel-hs400", MMC_CAP2_HS400}, -}; - static void sdhci_am654_setup_dll(struct sdhci_host *host, unsigned int clock) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); @@ -134,6 +167,10 @@ static void sdhci_am654_setup_dll(struct sdhci_host *host, unsigned int clock) u32 mask, val; int ret; + /* Disable delay chain mode */ + regmap_update_bits(sdhci_am654->base, PHY_CTRL5, + SELDLYTXCLK_MASK | SELDLYRXCLK_MASK, 0); + if (sdhci_am654->flags & FREQSEL_2_BIT) { switch (clock) { case 200000000: @@ -188,8 +225,32 @@ static void sdhci_am654_setup_dll(struct sdhci_host *host, unsigned int clock) dev_err(mmc_dev(host->mmc), "DLL failed to relock\n"); return; } +} - sdhci_am654->dll_on = true; +static void sdhci_am654_write_itapdly(struct sdhci_am654_data *sdhci_am654, + u32 itapdly) +{ + /* Set ITAPCHGWIN before writing to ITAPDLY */ + regmap_update_bits(sdhci_am654->base, PHY_CTRL4, ITAPCHGWIN_MASK, + 1 << ITAPCHGWIN_SHIFT); + regmap_update_bits(sdhci_am654->base, PHY_CTRL4, ITAPDLYSEL_MASK, + itapdly << ITAPDLYSEL_SHIFT); + regmap_update_bits(sdhci_am654->base, PHY_CTRL4, ITAPCHGWIN_MASK, 0); +} + +static void sdhci_am654_setup_delay_chain(struct sdhci_am654_data *sdhci_am654, + unsigned char timing) +{ + u32 mask, val; + + regmap_update_bits(sdhci_am654->base, PHY_CTRL1, ENDLL_MASK, 0); + + val = 1 << SELDLYTXCLK_SHIFT | 1 << SELDLYRXCLK_SHIFT; + mask = SELDLYTXCLK_MASK | SELDLYRXCLK_MASK; + regmap_update_bits(sdhci_am654->base, PHY_CTRL5, mask, val); + + sdhci_am654_write_itapdly(sdhci_am654, + sdhci_am654->itap_del_sel[timing]); } static void sdhci_am654_set_clock(struct sdhci_host *host, unsigned int clock) @@ -201,11 +262,7 @@ static void sdhci_am654_set_clock(struct sdhci_host *host, unsigned int clock) u32 otap_del_ena; u32 mask, val; - if (sdhci_am654->dll_on) { - regmap_update_bits(sdhci_am654->base, PHY_CTRL1, ENDLL_MASK, 0); - - sdhci_am654->dll_on = false; - } + regmap_update_bits(sdhci_am654->base, PHY_CTRL1, ENDLL_MASK, 0); sdhci_set_clock(host, clock); @@ -233,14 +290,10 @@ static void sdhci_am654_set_clock(struct sdhci_host *host, unsigned int clock) regmap_update_bits(sdhci_am654->base, PHY_CTRL4, mask, val); - if (timing > MMC_TIMING_UHS_SDR25 && clock > CLOCK_TOO_SLOW_HZ) { - regmap_update_bits(sdhci_am654->base, PHY_CTRL5, - SELDLYTXCLK_MASK, 0); + if (timing > MMC_TIMING_UHS_SDR25 && clock >= CLOCK_TOO_SLOW_HZ) sdhci_am654_setup_dll(host, clock); - } else { - regmap_update_bits(sdhci_am654->base, PHY_CTRL5, - SELDLYTXCLK_MASK, 1 << SELDLYTXCLK_SHIFT); - } + else + sdhci_am654_setup_delay_chain(sdhci_am654, timing); regmap_update_bits(sdhci_am654->base, PHY_CTRL5, CLKBUFSEL_MASK, sdhci_am654->clkbuf_sel); @@ -272,9 +325,19 @@ static void sdhci_j721e_4bit_set_clock(struct sdhci_host *host, sdhci_set_clock(host, clock); } +static u8 sdhci_am654_write_power_on(struct sdhci_host *host, u8 val, int reg) +{ + writeb(val, host->ioaddr + reg); + usleep_range(1000, 10000); + return readb(host->ioaddr + reg); +} + +#define MAX_POWER_ON_TIMEOUT 1500000 /* us */ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) { unsigned char timing = host->mmc->ios.timing; + u8 pwr; + int ret; if (reg == SDHCI_HOST_CONTROL) { switch (timing) { @@ -291,6 +354,19 @@ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) } writeb(val, host->ioaddr + reg); + if (reg == SDHCI_POWER_CONTROL && (val & SDHCI_POWER_ON)) { + /* + * Power on will not happen until the card detect debounce + * timer expires. Wait at least 1.5 seconds for the power on + * bit to be set + */ + ret = read_poll_timeout(sdhci_am654_write_power_on, pwr, + pwr & SDHCI_POWER_ON, 0, + MAX_POWER_ON_TIMEOUT, false, host, val, + reg); + if (ret) + dev_warn(mmc_dev(host->mmc), "Power on failed\n"); + } } static int sdhci_am654_execute_tuning(struct mmc_host *mmc, u32 opcode) @@ -322,7 +398,46 @@ static u32 sdhci_am654_cqhci_irq(struct sdhci_host *host, u32 intmask) return 0; } +#define ITAP_MAX 32 +static int sdhci_am654_platform_execute_tuning(struct sdhci_host *host, + u32 opcode) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_am654_data *sdhci_am654 = sdhci_pltfm_priv(pltfm_host); + int cur_val, prev_val = 1, fail_len = 0, pass_window = 0, pass_len; + u32 itap; + + /* Enable ITAPDLY */ + regmap_update_bits(sdhci_am654->base, PHY_CTRL4, ITAPDLYENA_MASK, + 1 << ITAPDLYENA_SHIFT); + + for (itap = 0; itap < ITAP_MAX; itap++) { + sdhci_am654_write_itapdly(sdhci_am654, itap); + + cur_val = !mmc_send_tuning(host->mmc, opcode, NULL); + if (cur_val && !prev_val) + pass_window = itap; + + if (!cur_val) + fail_len++; + + prev_val = cur_val; + } + /* + * Having determined the length of the failing window and start of + * the passing window calculate the length of the passing window and + * set the final value halfway through it considering the range as a + * circular buffer + */ + pass_len = ITAP_MAX - fail_len; + itap = (pass_window + (pass_len >> 1)) % ITAP_MAX; + sdhci_am654_write_itapdly(sdhci_am654, itap); + + return 0; +} + static struct sdhci_ops sdhci_am654_ops = { + .platform_execute_tuning = sdhci_am654_platform_execute_tuning, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_pltfm_clk_get_max_clock, .set_uhs_signaling = sdhci_set_uhs_signaling, @@ -352,6 +467,7 @@ static const struct sdhci_am654_driver_data sdhci_am654_drvdata = { }; static struct sdhci_ops sdhci_j721e_8bit_ops = { + .platform_execute_tuning = sdhci_am654_platform_execute_tuning, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_pltfm_clk_get_max_clock, .set_uhs_signaling = sdhci_set_uhs_signaling, @@ -375,6 +491,7 @@ static const struct sdhci_am654_driver_data sdhci_j721e_8bit_drvdata = { }; static struct sdhci_ops sdhci_j721e_4bit_ops = { + .platform_execute_tuning = sdhci_am654_platform_execute_tuning, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_pltfm_clk_get_max_clock, .set_uhs_signaling = sdhci_set_uhs_signaling, @@ -445,7 +562,7 @@ static int sdhci_am654_get_otap_delay(struct sdhci_host *host, int i; int ret; - ret = device_property_read_u32(dev, td[MMC_TIMING_LEGACY].binding, + ret = device_property_read_u32(dev, td[MMC_TIMING_LEGACY].otap_binding, &sdhci_am654->otap_del_sel[MMC_TIMING_LEGACY]); if (ret) { /* @@ -468,11 +585,11 @@ static int sdhci_am654_get_otap_delay(struct sdhci_host *host, for (i = MMC_TIMING_MMC_HS; i <= MMC_TIMING_MMC_HS400; i++) { - ret = device_property_read_u32(dev, td[i].binding, + ret = device_property_read_u32(dev, td[i].otap_binding, &sdhci_am654->otap_del_sel[i]); if (ret) { dev_dbg(dev, "Couldn't find %s\n", - td[i].binding); + td[i].otap_binding); /* * Remove the corresponding capability * if an otap-del-sel value is not found @@ -482,6 +599,10 @@ static int sdhci_am654_get_otap_delay(struct sdhci_host *host, else host->mmc->caps2 &= ~td[i].capability; } + + if (td[i].itap_binding) + device_property_read_u32(dev, td[i].itap_binding, + &sdhci_am654->itap_del_sel[i]); } return 0; @@ -527,6 +648,10 @@ static int sdhci_am654_init(struct sdhci_host *host) regmap_update_bits(sdhci_am654->base, CTL_CFG_2, SLOTTYPE_MASK, ctl_cfg_2); + /* Enable tuning for SDR50 */ + regmap_update_bits(sdhci_am654->base, CTL_CFG_3, TUNINGFORSDR50_MASK, + TUNINGFORSDR50_MASK); + ret = sdhci_setup_host(host); if (ret) return ret; @@ -614,6 +739,7 @@ static const struct of_device_id sdhci_am654_of_match[] = { }, { /* sentinel */ } }; +MODULE_DEVICE_TABLE(of, sdhci_am654_of_match); static int sdhci_am654_probe(struct platform_device *pdev) { @@ -721,6 +847,7 @@ static int sdhci_am654_remove(struct platform_device *pdev) static struct platform_driver sdhci_am654_driver = { .driver = { .name = "sdhci-am654", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_am654_of_match, }, .probe = sdhci_am654_probe, diff --git a/drivers/mmc/host/sdhci_f_sdh30.c b/drivers/mmc/host/sdhci_f_sdh30.c index 4625cc071b61..3f5977979cf2 100644 --- a/drivers/mmc/host/sdhci_f_sdh30.c +++ b/drivers/mmc/host/sdhci_f_sdh30.c @@ -219,6 +219,7 @@ MODULE_DEVICE_TABLE(acpi, f_sdh30_acpi_ids); static struct platform_driver sdhci_f_sdh30_driver = { .driver = { .name = "f_sdh30", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(f_sdh30_dt_ids), .acpi_match_table = ACPI_PTR(f_sdh30_acpi_ids), .pm = &sdhci_pltfm_pmops, diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 9f53634aa411..e5e457037235 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1562,6 +1562,7 @@ static struct platform_driver sh_mmcif_driver = { .remove = sh_mmcif_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sh_mmcif_dev_pm_ops, .of_match_table = sh_mmcif_of_match, }, diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index 5e95bbc51644..fc62773602ec 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1514,6 +1514,7 @@ static const struct dev_pm_ops sunxi_mmc_pm_ops = { static struct platform_driver sunxi_mmc_driver = { .driver = { .name = "sunxi-mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sunxi_mmc_of_match), .pm = &sunxi_mmc_pm_ops, }, diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 93e83ad25976..d2d3b8df1bbe 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -77,18 +77,10 @@ static void tmio_mmc_set_clock(struct tmio_mmc_host *host, static void tmio_mmc_reset(struct tmio_mmc_host *host) { - /* FIXME - should we set stop clock reg here */ - sd_ctrl_write16(host, CTL_RESET_SD, 0x0000); sd_ctrl_write16(host, CTL_RESET_SDIO, 0x0000); usleep_range(10000, 11000); - sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); sd_ctrl_write16(host, CTL_RESET_SDIO, 0x0001); usleep_range(10000, 11000); - - if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { - sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, host->sdio_irq_mask); - sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); - } } #ifdef CONFIG_PM_SLEEP @@ -221,6 +213,7 @@ static const struct dev_pm_ops tmio_mmc_dev_pm_ops = { static struct platform_driver tmio_mmc_driver = { .driver = { .name = "tmio-mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &tmio_mmc_dev_pm_ops, }, .probe = tmio_mmc_probe, diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 0a4f36500add..9546e542619c 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -178,14 +178,8 @@ struct tmio_mmc_host { unsigned int direction, int blk_size); int (*write16_hook)(struct tmio_mmc_host *host, int addr); void (*reset)(struct tmio_mmc_host *host); - void (*hw_reset)(struct tmio_mmc_host *host); bool (*check_retune)(struct tmio_mmc_host *host); - - /* - * Mandatory callback for tuning to occur which is optional for SDR50 - * and mandatory for SDR104. - */ - int (*execute_tuning)(struct tmio_mmc_host *host, u32 opcode); + void (*fixup_request)(struct tmio_mmc_host *host, struct mmc_request *mrq); void (*prepare_hs400_tuning)(struct tmio_mmc_host *host); void (*hs400_downgrade)(struct tmio_mmc_host *host); diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 946fb013c610..2fce0518632d 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -172,24 +172,15 @@ static void tmio_mmc_reset(struct tmio_mmc_host *host) sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); usleep_range(10000, 11000); + if (host->reset) + host->reset(host); + if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, host->sdio_irq_mask); sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); } } -static void tmio_mmc_hw_reset(struct mmc_host *mmc) -{ - struct tmio_mmc_host *host = mmc_priv(mmc); - - host->reset(host); - - tmio_mmc_abort_dma(host); - - if (host->hw_reset) - host->hw_reset(host); -} - static void tmio_mmc_reset_work(struct work_struct *work) { struct tmio_mmc_host *host = container_of(work, struct tmio_mmc_host, @@ -228,11 +219,12 @@ static void tmio_mmc_reset_work(struct work_struct *work) spin_unlock_irqrestore(&host->lock, flags); - tmio_mmc_hw_reset(host->mmc); + tmio_mmc_reset(host); /* Ready for new calls */ host->mrq = NULL; + tmio_mmc_abort_dma(host); mmc_request_done(host->mmc, mrq); } @@ -720,24 +712,6 @@ static int tmio_mmc_start_data(struct tmio_mmc_host *host, return 0; } -static int tmio_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) -{ - struct tmio_mmc_host *host = mmc_priv(mmc); - int ret; - - if (!host->execute_tuning) - return 0; - - ret = host->execute_tuning(host, opcode); - - if (ret < 0) { - dev_warn(&host->pdev->dev, "Tuning procedure failed\n"); - tmio_mmc_hw_reset(mmc); - } - - return ret; -} - static void tmio_process_mrq(struct tmio_mmc_host *host, struct mmc_request *mrq) { @@ -835,6 +809,9 @@ static void tmio_mmc_finish_request(struct tmio_mmc_host *host) return; } + if (host->fixup_request) + host->fixup_request(host, mrq); + mmc_request_done(host->mmc, mrq); } @@ -1011,8 +988,6 @@ static struct mmc_host_ops tmio_mmc_ops = { .get_cd = tmio_mmc_get_cd, .enable_sdio_irq = tmio_mmc_enable_sdio_irq, .multi_io_quirk = tmio_multi_io_quirk, - .hw_reset = tmio_mmc_hw_reset, - .execute_tuning = tmio_mmc_execute_tuning, }; static int tmio_mmc_init_ocr(struct tmio_mmc_host *host) @@ -1156,9 +1131,6 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host) mmc->caps & MMC_CAP_NEEDS_POLL || !mmc_card_is_removable(mmc)); - if (!_host->reset) - _host->reset = tmio_mmc_reset; - /* * On Gen2+, eMMC with NONREMOVABLE currently fails because native * hotplug gets disabled. It seems RuntimePM related yet we need further @@ -1180,7 +1152,7 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host) _host->sdio_irq_mask = TMIO_SDIO_MASK_ALL; _host->set_clock(_host, 0); - tmio_mmc_hw_reset(mmc); + tmio_mmc_reset(_host); _host->sdcard_irq_mask = sd_ctrl_read16_and_16_as_32(_host, CTL_IRQ_MASK); tmio_mmc_disable_mmc_irqs(_host, TMIO_MASK_ALL); @@ -1283,7 +1255,7 @@ int tmio_mmc_host_runtime_resume(struct device *dev) struct tmio_mmc_host *host = dev_get_drvdata(dev); tmio_mmc_clk_enable(host); - tmio_mmc_hw_reset(host->mmc); + tmio_mmc_reset(host); if (host->clk_cache) host->set_clock(host, host->clk_cache); diff --git a/drivers/mmc/host/uniphier-sd.c b/drivers/mmc/host/uniphier-sd.c index f82baf99fd69..3092466a99ab 100644 --- a/drivers/mmc/host/uniphier-sd.c +++ b/drivers/mmc/host/uniphier-sd.c @@ -409,8 +409,9 @@ static void uniphier_sd_clk_disable(struct tmio_mmc_host *host) clk_disable_unprepare(priv->clk); } -static void uniphier_sd_hw_reset(struct tmio_mmc_host *host) +static void uniphier_sd_hw_reset(struct mmc_host *mmc) { + struct tmio_mmc_host *host = mmc_priv(mmc); struct uniphier_sd_priv *priv = uniphier_sd_priv(host); reset_control_assert(priv->rst_hw); @@ -597,7 +598,7 @@ static int uniphier_sd_probe(struct platform_device *pdev) ret = PTR_ERR(priv->rst_hw); goto free_host; } - host->hw_reset = uniphier_sd_hw_reset; + host->ops.hw_reset = uniphier_sd_hw_reset; } if (host->mmc->caps & MMC_CAP_UHS) { @@ -684,6 +685,7 @@ static struct platform_driver uniphier_sd_driver = { .remove = uniphier_sd_remove, .driver = { .name = "uniphier-sd", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = uniphier_sd_match, }, }; diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 7666c90054ae..e2d5112d809d 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -1890,6 +1890,7 @@ static struct platform_driver usdhi6_driver = { .remove = usdhi6_remove, .driver = { .name = "usdhi6rol0", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = usdhi6_of_match, }, }; diff --git a/drivers/mmc/host/via-sdmmc.c b/drivers/mmc/host/via-sdmmc.c index 49dab9f42b6d..9b755ea0fa03 100644 --- a/drivers/mmc/host/via-sdmmc.c +++ b/drivers/mmc/host/via-sdmmc.c @@ -1257,11 +1257,14 @@ static void __maybe_unused via_init_sdc_pm(struct via_crdr_mmc_host *host) static int __maybe_unused via_sd_suspend(struct device *dev) { struct via_crdr_mmc_host *host; + unsigned long flags; host = dev_get_drvdata(dev); + spin_lock_irqsave(&host->lock, flags); via_save_pcictrlreg(host); via_save_sdcreg(host); + spin_unlock_irqrestore(&host->lock, flags); device_wakeup_enable(dev); diff --git a/drivers/mmc/host/wbsd.c b/drivers/mmc/host/wbsd.c index 67f917d6ecd3..cd63ea865b77 100644 --- a/drivers/mmc/host/wbsd.c +++ b/drivers/mmc/host/wbsd.c @@ -1905,6 +1905,7 @@ static struct platform_driver wbsd_driver = { .resume = wbsd_platform_resume, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c index 2c4ba1fa4bbf..cf10949fb0ac 100644 --- a/drivers/mmc/host/wmt-sdmmc.c +++ b/drivers/mmc/host/wmt-sdmmc.c @@ -990,6 +990,7 @@ static struct platform_driver wmt_mci_driver = { .remove = wmt_mci_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = wmt_mci_pm_ops, .of_match_table = wmt_mci_dt_ids, }, diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 575636f6491e..68b95ad82126 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -14,6 +14,7 @@ config BTRFS_FS select LZO_DECOMPRESS select ZSTD_COMPRESS select ZSTD_DECOMPRESS + select FS_IOMAP select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ea1c28ccb44f..b3268f4ea5f3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, while (!list_empty(&pending_edge)) { struct btrfs_backref_node *upper; struct btrfs_backref_node *lower; - struct rb_node *rb_node; edge = list_first_entry(&pending_edge, struct btrfs_backref_edge, list[UPPER]); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ea8aaf36647e..c0f1d6818df7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache) { struct btrfs_space_info *space_info = cache->space_info; int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); - - if (first) - btrfs_sysfs_add_block_group_type(cache); } static struct btrfs_block_group *btrfs_create_block_group_cache( @@ -1873,7 +1867,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static int read_block_group_item(struct btrfs_block_group *cache, +static void read_block_group_item(struct btrfs_block_group *cache, struct btrfs_path *path, const struct btrfs_key *key) { @@ -1887,8 +1881,6 @@ static int read_block_group_item(struct btrfs_block_group *cache, sizeof(bgi)); cache->used = btrfs_stack_block_group_used(&bgi); cache->flags = btrfs_stack_block_group_flags(&bgi); - - return 0; } static int read_one_block_group(struct btrfs_fs_info *info, @@ -1907,9 +1899,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - ret = read_block_group_item(cache, path, key); - if (ret < 0) - goto error; + read_block_group_item(cache, path, key); set_free_space_tree_thresholds(cache); @@ -2035,8 +2025,18 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } - rcu_read_lock(); - list_for_each_entry_rcu(space_info, &info->space_info, list) { + list_for_each_entry(space_info, &info->space_info, list) { + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (list_empty(&space_info->block_groups[i])) + continue; + cache = list_first_entry(&space_info->block_groups[i], + struct btrfs_block_group, + list); + btrfs_sysfs_add_block_group_type(cache); + } + if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | @@ -2056,7 +2056,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } - rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); @@ -2097,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) return; while (!list_empty(&trans->new_bgs)) { + int index; + block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group, bg_list); if (ret) goto next; + index = btrfs_bg_flags_to_raid_index(block_group->flags); + ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); @@ -2111,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); + + /* + * If we restriped during balance, we may have added a new raid + * type, so now add the sysfs entries when it is safe to do so. + * We don't have to worry about locking here as it's handled in + * btrfs_sysfs_add_block_group_type. + */ + if (block_group->space_info->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(block_group); + /* Already aborted the transaction if it failed. */ next: btrfs_delayed_refs_rsv_release(fs_info, 1); @@ -2785,7 +2798,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) * finished yet (no block group item in the extent tree * yet, etc). If this is the case, wait for all free * space endio workers to finish and retry. This is a - * a very rare case so no need for a more efficient and + * very rare case so no need for a more efficient and * complex approach. */ if (ret == -ENOENT) { @@ -2961,6 +2974,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake + * tickets if that happens + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); @@ -2994,6 +3014,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, if (delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); + + btrfs_try_granting_tickets(cache->fs_info, space_info); spin_unlock(&space_info->lock); } @@ -3002,12 +3024,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) found->force_alloc = CHUNK_ALLOC_FORCE; } - rcu_read_unlock(); } static int should_alloc_chunk(struct btrfs_fs_info *fs_info, @@ -3338,14 +3358,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->block_group_cache_lock); - /* - * Now that all the block groups are freed, go through and free all the - * space_info structs. This is only called during the final stages of - * unmount, and so we know nobody is using them. We call - * synchronize_rcu() once before we start, just to be on the safe side. - */ - synchronize_rcu(); - btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c47b6c6fea9f..92dd86bceae3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -21,14 +21,18 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE, + BTRFS_INODE_FLUSH_ON_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, + /* + * Always set under the VFS' inode lock, otherwise it can cause races + * during fsync (we start as a fast fsync and then end up in a full + * fsync racing with ordered extent completion). + */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, - BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, }; @@ -212,6 +216,11 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) +{ + return inode->root->fs_info->sectorsize; +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); @@ -324,23 +333,6 @@ struct btrfs_dio_private { u8 csums[]; }; -/* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. - */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) -{ - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); -} - -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) -{ - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1ab56a734e70..eeface30facd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -29,41 +29,6 @@ #include "extent_io.h" #include "extent_map.h" -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); -void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); - -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); -void lzo_free_workspace(struct list_head *ws); - -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -void zstd_init_workspace_manager(void); -void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); -void zstd_put_workspace(struct list_head *ws); - static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9f3dbe372631..8001b700ea3a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cd392da69b81..113da62dc17f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_node_key(buf, &disk_key, 0); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, - &disk_key, level, buf->start, 0); + &disk_key, level, buf->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -957,7 +958,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( const struct btrfs_disk_key *disk_key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *ret; @@ -986,7 +988,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( ret = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, disk_key, level, - hint, empty_size); + hint, empty_size, nest); trans->can_flush_pending_bgs = true; return ret; @@ -1009,7 +1011,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -1040,7 +1043,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = parent->start; cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1061,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1068,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1100,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(buf); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1446,7 +1455,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; @@ -1485,7 +1495,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); + parent_slot, cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -1657,7 +1667,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize)); + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -1855,7 +1866,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking_write(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); @@ -1891,10 +1903,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = NULL; if (left) { - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; goto enospc; @@ -1906,10 +1919,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; if (right) { - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, + BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; goto enospc; @@ -2069,7 +2083,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (left) { u32 left_nr; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); @@ -2077,7 +2091,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) wret = 1; else { @@ -2123,7 +2138,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (right) { u32 right_nr; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); @@ -2132,7 +2147,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, BTRFS_NESTING_RIGHT_COW); if (ret) wret = 1; else { @@ -2601,7 +2616,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + b = __btrfs_read_lock_root_node(root, p->recurse); level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -2740,11 +2755,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_path_blocking(p); if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0, - &b); + &b, + BTRFS_NESTING_COW); else err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], &b); + p->slots[level + 1], &b, + BTRFS_NESTING_COW); if (err) { ret = err; goto done; @@ -2875,7 +2892,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else { if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, + p->recurse); } p->locks[level] = BTRFS_READ_LOCK; } @@ -3163,6 +3181,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, fixup_low_keys(path, &disk_key, 1); } +/* + * Check key order of two sibling extent buffers. + * + * Return true if something is wrong. + * Return false if everything is fine. + * + * Tree-checker only works inside one tree block, thus the following + * corruption can not be detected by tree-checker: + * + * Leaf @left | Leaf @right + * -------------------------------------------------------------- + * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | + * + * Key f6 in leaf @left itself is valid, but not valid when the next + * key in leaf @right is 7. + * This can only be checked at tree block merge time. + * And since tree checker has ensured all key order in each tree block + * is correct, we only need to bother the last key of @left and the first + * key of @right. + */ +static bool check_sibling_keys(struct extent_buffer *left, + struct extent_buffer *right) +{ + struct btrfs_key left_last; + struct btrfs_key right_first; + int level = btrfs_header_level(left); + int nr_left = btrfs_header_nritems(left); + int nr_right = btrfs_header_nritems(right); + + /* No key to check in one of the tree blocks */ + if (!nr_left || !nr_right) + return false; + + if (level) { + btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_node_key_to_cpu(right, &right_first, 0); + } else { + btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_item_key_to_cpu(right, &right_first, 0); + } + + if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + btrfs_crit(left->fs_info, +"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", + left_last.objectid, left_last.type, + left_last.offset, right_first.objectid, + right_first.type, right_first.offset); + return true; + } + return false; +} + /* * try to push data from one node into the next node left in the * tree. @@ -3207,6 +3277,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + /* dst is the left eb, src is the middle eb */ + if (check_sibling_keys(dst, src)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3275,6 +3351,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + /* dst is the right eb, src is the middle eb */ + if (check_sibling_keys(src, dst)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), @@ -3331,7 +3413,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, - root->node->start, 0); + root->node->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); @@ -3461,7 +3544,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_node_key(c, &disk_key, mid); split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, - c->start, 0); + c->start, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); @@ -3730,7 +3813,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return 1; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); @@ -3739,7 +3822,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; @@ -3751,6 +3834,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (path->slots[0] == left_nritems && !empty) { /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete @@ -3963,7 +4052,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return 1; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); @@ -3974,7 +4063,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_LEFT_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) @@ -3988,6 +4078,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + goto out; + } return __push_leaf_left(path, min_data_size, empty, left, free_space, right_nritems, max_slot); @@ -4236,8 +4330,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, else btrfs_item_key(l, &disk_key, mid); + /* + * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double + * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES + * subclasses, which is 8 at the time of this patch, and we've maxed it + * out. In the future we could add a + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0); + l->start, 0, num_doubles ? + BTRFS_NESTING_NEW_ROOT : + BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); @@ -4482,9 +4586,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, new_key, &item_size, 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4657,14 +4759,20 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot +/** + * setup_items_for_insert - Helper called before inserting one or more items + * to a leaf. Main purpose is to save stack depth by doing the bulk of the work + * in a function that doesn't call btrfs_search_slot + * + * @root: root we are inserting items to + * @path: points to the leaf/slot where we are going to insert new items + * @cpu_key: array of keys for items to be inserted + * @data_size: size of the body of each item we are going to insert + * @nr: size of @cpu_key/@data_size arrays */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) + int nr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -4675,6 +4783,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *leaf; int slot; struct btrfs_map_token token; + u32 total_size; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + total_size = total_data + (nr * sizeof(struct btrfs_item)); if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -4701,7 +4815,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(leaf); - btrfs_crit(fs_info, "slot %d old_data %d data_end %d", + btrfs_crit(fs_info, + "item at slot %d with data offset %u beyond data end of leaf %u", slot, old_data, data_end); BUG(); } @@ -4734,8 +4849,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - btrfs_set_token_item_offset(&token, item, data_end - data_size[i]); data_end -= data_size[i]; + btrfs_set_token_item_offset(&token, item, data_end); btrfs_set_token_item_size(&token, item, data_size[i]); } @@ -4777,8 +4892,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, - total_data, total_size, nr); + setup_items_for_insert(root, path, cpu_key, data_size, nr); return 0; } @@ -5115,7 +5229,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, slot--; /* * check this node pointer against the min_trans parameters. - * If it is too old, old, skip to the next one. + * If it is too old, skip to the next one. */ while (slot < nritems) { u64 gen; @@ -5379,7 +5493,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, } if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_RIGHT, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5414,7 +5530,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_RIGHT, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9a72896bed2e..aac3d6f4e35b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -374,6 +374,7 @@ struct btrfs_path { unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; + unsigned int recurse:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) @@ -494,7 +495,7 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -void btrfs_init_async_reclaim_work(struct work_struct *work); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ struct reloc_control; @@ -540,11 +541,6 @@ enum { BTRFS_FS_QUOTA_OVERRIDE, /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, - /* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ - BTRFS_FS_EXCL_OP, /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. @@ -565,6 +561,19 @@ enum { BTRFS_FS_DISCARD_RUNNING, }; +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -912,6 +921,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; @@ -935,6 +945,9 @@ struct btrfs_fs_info { */ int send_in_progress; + /* Type of exclusive operation running */ + unsigned long exclusive_operation; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -1181,24 +1194,40 @@ struct btrfs_root { #endif }; -struct btrfs_clone_extent_info { +/* + * Structure that conveys information about an extent that is going to replace + * all the extents in a file range. + */ +struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; u64 data_len; u64 file_offset; + /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; - u32 item_size; + /* + * Set to true when attempting to replace a file range with a new extent + * described by this structure, set to false when attempting to clone an + * existing extent into a file range. + */ + bool is_new_extent; + /* Meaningful only if is_new_extent is true. */ + int qgroup_reserved; + /* + * Meaningful only if is_new_extent is true. + * Used to track how many extent items we have already inserted in a + * subvolume tree that refer to the extent described by this structure, + * so that we know when to create a new delayed ref or update an existing + * one. + */ + int insertions; }; struct btrfs_file_private { void *filldir_buf; }; -static inline u32 btrfs_inode_sectorsize(const struct inode *inode) -{ - return btrfs_sb(inode->i_sb)->sectorsize; -} static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { @@ -1391,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token, #define cpu_to_le8(v) (v) #define __le8 u8 +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ @@ -1449,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = page_address(eb->pages[0]); \ - u##bits res = le##bits##_to_cpu(p->member); \ - return res; \ + return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]); \ - p->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ - return le##bits##_to_cpu(s->member); \ + return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ - s->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &s->member); \ } - static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { @@ -2524,7 +2561,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size); + u64 empty_size, + enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2592,6 +2630,8 @@ enum btrfs_reserve_flush_enum { * * Can be interruped by fatal signal. */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, BTRFS_RESERVE_FLUSH_ALL, /* @@ -2619,7 +2659,7 @@ enum btrfs_flush_state { int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); @@ -2651,8 +2691,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); @@ -2665,7 +2703,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2713,7 +2752,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); + int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, @@ -2930,6 +2969,10 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -2956,7 +2999,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); @@ -3017,6 +3060,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3031,6 +3075,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ int __init btrfs_auto_defrag_init(void); @@ -3053,9 +3100,9 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); @@ -3536,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); - static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0e354e9e57d0..bacee09b7bfd 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; /* Make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, fs_info->sectorsize); - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } + if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; -again: - /* Make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * If we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. - */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* Commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - spin_unlock(&data_sinfo->lock); - - return 0; + return btrfs_reserve_data_bytes(fs_info, bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, @@ -277,9 +166,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - spin_unlock(&data_sinfo->lock); + btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); } /* diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf1595a42a98..5aba81e16113 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_qgroup_reserve_meta_prealloc(root, - fs_info->nodesize, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret < 0) return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, @@ -769,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, } /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, - total_data_size, total_size, nitems); + setup_items_for_insert(root, path, keys, data_size, nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e4a1c6afe35d..4a0243cb9d97 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -64,10 +64,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) @@ -224,13 +220,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *device; struct block_device *bdev; - struct list_head *devices; struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) { + if (srcdev->fs_devices->seeding) { btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; } @@ -244,8 +239,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -512,7 +506,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_device(tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); @@ -630,6 +624,32 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, return ret; } +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -661,7 +681,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; @@ -781,7 +801,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) btrfs_scratch_superblocks(fs_info, src_device->bdev, @@ -799,32 +819,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, return 0; } -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - u64 start = 0; - int i; - - write_lock(&em_tree->lock); - do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) - break; - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) - if (srcdev == map->stripes[i].dev) - map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); - } while (start); - write_unlock(&em_tree->lock); -} - /* * Read progress of device replace status according to the state and last * stored position. The value format is the same as for @@ -1025,7 +1019,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * should never allow both to start and pause. We don't want to allow * dev-replace to start anyway. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; @@ -1062,7 +1056,7 @@ static int btrfs_dev_replace_kthread(void *data) ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret && ret != -ECANCELED); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f72b092bc22..764001609a15 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,7 +50,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -204,53 +203,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, #endif -/* - * extents on the btree inode are pretty simple, there's one extent - * that covers the entire device - */ -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (em) { - read_unlock(&em_tree->lock); - goto out; - } - read_unlock(&em_tree->lock); - - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - em->start = 0; - em->len = (u64)-1; - em->block_len = (u64)-1; - em->block_start = 0; - - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - if (ret == -EEXIST) { - free_extent_map(em); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) - em = ERR_PTR(-EIO); - } else if (ret) { - free_extent_map(em); - em = ERR_PTR(ret); - } - write_unlock(&em_tree->lock); - -out: - return em; -} - /* * Compute the csum of a btree block and store the result to provided buffer. */ @@ -545,38 +497,35 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - int ret = 1; + u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); - while (fs_devices) { - u8 *metadata_uuid; + /* + * Checking the incompat flag is only valid for the current fs. For + * seed devices it's forbidden to have their uuid changed so reading + * ->fsid in this case is fine + */ + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; - /* - * Checking the incompat flag is only valid for the current - * fs. For seed devices it's forbidden to have their uuid - * changed so reading ->fsid in this case is fine - */ - if (fs_devices == fs_info->fs_devices && - btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + return 0; - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { - ret = 0; - break; - } - fs_devices = fs_devices->seed; - } - return ret; + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) + return 0; + + return 1; } -static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror) { u64 found_start; int found_level; @@ -864,9 +813,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info, return 1; } -static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(fs_info, BTRFS_I(inode)); @@ -951,11 +899,6 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_readpage(struct file *file, struct page *page) -{ - return extent_read_full_page(page, btree_get_extent, 0); -} - static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) @@ -995,7 +938,6 @@ static int btree_set_page_dirty(struct page *page) } static const struct address_space_operations btree_aops = { - .readpage = btree_readpage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, @@ -1208,7 +1150,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1280,7 +1223,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0); + NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { btrfs_put_root(root); return ERR_CAST(leaf); @@ -1505,10 +1448,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; while (!list_empty(&fs_info->allocated_roots)) { + char buf[BTRFS_ROOT_NAME_BUF_LEN]; + root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); - btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", - root->root_key.objectid, root->root_key.offset, + btrfs_err(fs_info, "leaked root %s refcount %d", + btrfs_root_name(root->root_key.objectid, buf), refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); @@ -2115,12 +2060,10 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_INODE_IO, inode); + IO_TREE_BTREE_INODE_IO, inode); BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); - BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -2626,18 +2569,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), generation, level, NULL); - if (IS_ERR(tree_root->node) || - !extent_buffer_uptodate(tree_root->node)) { + if (IS_ERR(tree_root->node)) { handle_error = true; + ret = PTR_ERR(tree_root->node); + tree_root->node = NULL; + btrfs_warn(fs_info, "couldn't read tree root"); + continue; - if (IS_ERR(tree_root->node)) { - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - } else if (!extent_buffer_uptodate(tree_root->node)) { - ret = -EUCLEAN; - } - - btrfs_warn(fs_info, "failed to read tree root"); + } else if (!extent_buffer_uptodate(tree_root->node)) { + handle_error = true; + ret = -EIO; + btrfs_warn(fs_info, "error while reading tree root"); continue; } @@ -2753,7 +2695,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->check_integrity_print_mask = 0; #endif btrfs_init_balance(fs_info); - btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); + btrfs_init_async_reclaim_work(fs_info); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2928,7 +2870,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Verify the type first, if that or the the checksum value are + * Verify the type first, if that or the checksum value are * corrupted, we'll find out */ csum_type = btrfs_super_csum_type(disk_super); @@ -3482,8 +3424,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_CAST(page); super = page_address(page); - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { + if (btrfs_super_magic(super) != BTRFS_MAGIC) { + btrfs_release_disk_super(super); + return ERR_PTR(-ENODATA); + } + + if (btrfs_super_bytenr(super) != bytenr) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } @@ -4056,6 +4002,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4687,9 +4634,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } - -static const struct extent_io_ops btree_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btree_submit_bio_hook, - .readpage_end_io_hook = btree_readpage_end_io_hook, -}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 00dc39d47ed3..fee69ced58b4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,7 +76,11 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); - +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -123,9 +127,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 219a09a2b734..9800a8306368 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -40,6 +40,7 @@ struct io_failure_record; enum { IO_TREE_FS_PINNED_EXTENTS, IO_TREE_FS_EXCLUDED_EXTENTS, + IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, @@ -48,6 +49,7 @@ enum { IO_TREE_INODE_FILE_EXTENT, IO_TREE_LOG_CSUM_RANGE, IO_TREE_SELFTEST, + IO_TREE_DEVICE_ALLOC_STATE, }; struct extent_io_tree { @@ -61,7 +63,6 @@ struct extent_io_tree { u8 owner; spinlock_t lock; - const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 780b9c9a98fe..3b21fee13e77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1177,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + /* + * We're adding refs to a tree block we already own, this + * should not happen at all. + */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_crit(trans->fs_info, +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", + bytenr, num_bytes, root_objectid); + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + WARN_ON(1); + btrfs_crit(trans->fs_info, + "path->slots[0]=%d path->nodes[0]:", path->slots[0]); + btrfs_print_leaf(path->nodes[0]); + } + return -EUCLEAN; + } update_inline_extent_backref(path, iref, refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { @@ -1397,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* * __btrfs_inc_extent_ref - insert backreference for a given extent * + * The counterpart is in __btrfs_free_extent(), with examples and more details + * how it works. + * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for @@ -2849,11 +2867,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len -= to_add; } spin_unlock(&global_rsv->lock); - /* Add to any tickets we may have */ - if (len) - btrfs_try_granting_tickets(fs_info, - space_info); } + /* Add to any tickets we may have */ + if (!readonly && return_free_space && len) + btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } @@ -2935,6 +2952,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Drop one or more refs of @node. + * + * 1. Locate the extent refs. + * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. + * Locate it, then reduce the refs number or remove the ref line completely. + * + * 2. Update the refs count in EXTENT/METADATA_ITEM + * + * Inline backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 2 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * extent data backref root FS_TREE objectid 257 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 257 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 1 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * + * Keyed backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 754 gen 6 flags DATA + * [...] + * item 2 key (13631488 EXTENT_DATA_REF ) itemoff 3915 itemsize 28 + * extent data backref root FS_TREE objectid 866 offset 0 count 1 + * + * This function get called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 866 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 753 gen 6 flags DATA + * + * And that (13631488 EXTENT_DATA_REF ) gets removed. + */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -2967,7 +3043,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); + + if (!is_data && refs_to_drop != 1) { + btrfs_crit(info, +"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", + node->bytenr, refs_to_drop); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } if (is_data) skinny_metadata = false; @@ -2976,6 +3060,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, owner_objectid, owner_offset); if (ret == 0) { + /* + * Either the inline backref or the SHARED_DATA_REF/ + * SHARED_BLOCK_REF is found + * + * Here is a quick path to locate EXTENT/METADATA_ITEM. + * It's possible the EXTENT/METADATA_ITEM is near current slot. + */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -2992,13 +3083,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + + /* Quick path didn't find the EXTEMT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { - BUG_ON(iref); + if (iref) { + btrfs_crit(info, +"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } + /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, path, NULL, refs_to_drop, is_data, &last_ref); @@ -3009,6 +3108,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); path->leave_spinning = 1; + /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; @@ -3083,19 +3183,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + if (item_size < sizeof(*ei) + sizeof(*bi)) { + btrfs_crit(info, +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu", + key.objectid, key.type, key.offset, + owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_err(info, - "trying to drop %d refs but we only have %Lu for bytenr %Lu", + btrfs_crit(info, + "trying to drop %d refs but we only have %llu for bytenr %llu", refs_to_drop, refs, bytenr); - ret = -EINVAL; - btrfs_abort_transaction(trans, ret); - goto out; + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; } refs -= refs_to_drop; @@ -3107,7 +3214,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - BUG_ON(!found_extent); + if (!found_extent) { + btrfs_crit(info, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); @@ -3122,13 +3234,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + /* In this branch refs == 1 */ if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(path, iref)); + if (is_data && refs_to_drop != + extent_data_ref_count(path, iref)) { + btrfs_crit(info, + "invalid refs_to_drop, current refs %u refs_to_drop %u", + extent_data_ref_count(path, iref), + refs_to_drop); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } if (iref) { - BUG_ON(path->slots[0] != extent_slot); + if (path->slots[0] != extent_slot) { + btrfs_crit(info, +"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", + key.objectid, key.type, + key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { - BUG_ON(path->slots[0] != extent_slot + 1); + /* + * No inline ref, we must be at SHARED_* item, + * And it's single ref, it must be: + * | extent_slot ||extent_slot + 1| + * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] + */ + if (path->slots[0] != extent_slot + 1) { + btrfs_crit(info, + "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } path->slots[0] = extent_slot; num_to_del = 2; } @@ -3169,6 +3307,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; +err_dump: + /* + * Leaf dump can take up a lot of log buffer, so we only do full leaf + * dump for debug build. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", + path->slots[0], extent_slot); + btrfs_print_leaf(path->nodes[0]); + } + + btrfs_free_path(path); + return -EUCLEAN; } /* @@ -3918,11 +4069,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ -static noinline int find_free_extent(struct btrfs_fs_info *fs_info, +static noinline int find_free_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 empty_size, u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; @@ -3954,7 +4106,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(fs_info, num_bytes, empty_size, flags); + trace_find_free_extent(root, num_bytes, empty_size, flags); space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { @@ -4203,7 +4355,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); @@ -4504,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level, u64 owner) + u64 bytenr, int level, u64 owner, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; @@ -4527,7 +4680,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, } btrfs_set_buffer_lockdep_class(owner, buf, level); - btrfs_tree_lock(buf); + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); @@ -4573,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; @@ -4589,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level, root_objectid); + level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -4606,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, - root_objectid); + root_objectid, nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a940edb1e64f..60f5f68d892d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -160,19 +160,20 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; bio->bi_private = NULL; - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags); + if (is_data_inode(tree->private_data)) + ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + bio_flags); else - btrfsic_submit_bio(bio); + ret = btrfs_submit_metadata_bio(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } @@ -280,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, { tree->fs_info = fs_info; tree->state = RB_ROOT; - tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; @@ -2819,8 +2819,6 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - bool data_inode = btrfs_ino(BTRFS_I(inode)) - != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2851,9 +2849,12 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - ret = tree->ops->readpage_end_io_hook(io_bio, offset, - page, start, end, - mirror); + if (is_data_inode(inode)) + ret = btrfs_verify_data_csum(io_bio, offset, page, + start, end, mirror); + else + ret = btrfs_validate_metadata_buffer(io_bio, + offset, page, start, end, mirror); if (ret) uptodate = 0; else @@ -2866,7 +2867,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (data_inode) { + if (is_data_inode(inode)) { /* * The generic bio_readpage_error handles errors the @@ -2881,7 +2882,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (!btrfs_submit_read_repair(inode, bio, offset, page, start - page_offset(page), start, end, mirror, - tree->ops->submit_bio_hook)) { + btrfs_submit_data_bio)) { uptodate = !bio->bi_status; offset += len; continue; @@ -3053,7 +3054,6 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - ASSERT(tree->ops); if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; @@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page) static struct extent_map * __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, get_extent_t *get_extent, - struct extent_map **em_cached) + u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3142,12 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct page *page, - get_extent_t *get_extent, - struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, unsigned int read_flags, - u64 *prev_em_start) +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -3209,7 +3205,7 @@ static int __do_readpage(struct page *page, break; } em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, get_extent, em_cached); + end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); unlock_extent(tree, cur, end); @@ -3241,7 +3237,7 @@ static int __do_readpage(struct page *page, /* * If we have a file range that points to a compressed extent - * and it's followed by a consecutive file range that points to + * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a @@ -3325,7 +3321,7 @@ static int __do_readpage(struct page *page, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, - end_bio_extent_readpage, mirror_num, + end_bio_extent_readpage, 0, *bio_flags, this_bio_flag, force_bio_submit); @@ -3362,44 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); + btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -static int __extent_read_full_page(struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, - unsigned int read_flags) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, - bio_flags, read_flags, NULL); - return ret; -} - -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = 0; - int ret; - - ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, - &bio_flags, 0); - if (bio) - ret = submit_one_bio(bio, mirror_num, bio_flags); - return ret; -} - static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { @@ -4552,7 +4516,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) * helper function for fiemap, which doesn't want to see any holes. * This maps until we find something past 'last' */ -static struct extent_map *get_extent_skip_holes(struct inode *inode, +static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, u64 offset, u64 last) { u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -4567,7 +4531,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); + em = btrfs_get_extent_fiemap(inode, offset, len); if (IS_ERR_OR_NULL(em)) return em; @@ -4696,7 +4660,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; @@ -4707,12 +4671,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 last; u64 last_for_get_extent = 0; u64 disko = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct ulist *roots; struct ulist *tmp_ulist; @@ -4743,8 +4707,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { goto out_free_ulist; } else { @@ -4758,7 +4722,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, found_type = found_key.type; /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (found_key.objectid != btrfs_ino(inode) || found_type != BTRFS_EXTENT_DATA_KEY) { /* have to trust i_size as the end */ last = (u64)-1; @@ -4784,7 +4748,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, + lock_extent_bits(&inode->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent); @@ -4853,8 +4817,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * then we're just getting a count and we can skip the * lookup stuff. */ - ret = btrfs_check_shared(root, - btrfs_ino(BTRFS_I(inode)), + ret = btrfs_check_shared(root, btrfs_ino(inode), bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; @@ -4898,7 +4861,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, + unlock_extent_cached(&inode->io_tree, start, start + len - 1, &cached_state); out_free_ulist: @@ -4990,7 +4953,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); eb->blocking_writers = 0; - eb->lock_nested = false; + eb->lock_recursed = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); @@ -5574,20 +5537,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(page, - btree_get_extent, &bio, - mirror_num, &bio_flags, - REQ_META); + err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + page, page_offset(page), PAGE_SIZE, 0, + &bio, end_bio_extent_readpage, + mirror_num, 0, 0, false); if (err) { - ret = err; /* - * We use &bio in above __extent_read_full_page, - * so we ensure that if it returns error, the - * current page fails to add itself to bio and - * it's been unlocked. - * - * We must dec io_pages by ourselves. + * We failed to submit the bio so it's the + * caller's responsibility to perform cleanup + * i.e unlock page/set error bit. */ + ret = err; + SetPageError(page); + unlock_page(page); atomic_dec(&eb->io_pages); } } else { @@ -5622,6 +5584,36 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) return ret; } +static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + btrfs_warn(eb->fs_info, + "access to eb bytenr %llu len %lu out of range start %lu len %lu", + eb->start, eb->len, start, len); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + + return true; +} + +/* + * Check if the [start, start + len) range is valid before reading/writing + * the eb. + * NOTE: @start and @len are offset inside the eb, not logical address. + * + * Caller should not touch the dst/src memory if this function returns error. + */ +static inline int check_eb_range(const struct extent_buffer *eb, + unsigned long start, unsigned long len) +{ + unsigned long offset; + + /* start, start + len should not go beyond eb->len nor overflow */ + if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) + return report_eb_range(eb, start, len); + + return false; +} + void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { @@ -5632,12 +5624,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = start >> PAGE_SHIFT; - if (start + len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, len); - memset(dst, 0, len); + if (check_eb_range(eb, start, len)) return; - } offset = offset_in_page(start); @@ -5702,8 +5690,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long i = start >> PAGE_SHIFT; int ret = 0; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return -EINVAL; offset = offset_in_page(start); @@ -5756,8 +5744,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5785,8 +5773,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, char *kaddr; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5830,6 +5818,10 @@ void copy_extent_buffer(const struct extent_buffer *dst, char *kaddr; unsigned long i = dst_offset >> PAGE_SHIFT; + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(src, src_offset, len)) + return; + WARN_ON(src->len != dst_len); offset = offset_in_page(dst_offset); @@ -6019,25 +6011,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu dst len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu dst len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; while (len > 0) { dst_off_in_page = offset_in_page(dst_offset); @@ -6064,7 +6046,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; @@ -6073,18 +6054,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 30794ae58498..f39d02e7f7ef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -74,18 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); -struct extent_io_ops { - /* - * The following callbacks must be always defined, the function - * pointer will be called unconditionally. - */ - submit_bio_hook_t *submit_bio_hook; - int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror); -}; - - #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) struct extent_buffer { @@ -102,7 +90,7 @@ struct extent_buffer { int blocking_writers; atomic_t blocking_readers; - bool lock_nested; + bool lock_recursed; /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; @@ -193,8 +181,11 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num); +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags); +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); @@ -203,7 +194,7 @@ int extent_writepages(struct address_space *mapping, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void extent_readahead(struct readahead_control *rac); -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7d5ec71615b8..8f4f2bd6d9b9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; - count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - csum, nblocks); + count = btrfs_find_ordered_sum(BTRFS_I(inode), offset, + disk_bytenr, csum, nblocks); if (count) goto found; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4507c3d09399..0ff659455b1e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1057,11 +1057,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + - extent_item_size, 1); + setup_items_for_insert(root, path, &key, &extent_item_size, 1); *key_inserted = 1; } @@ -1477,9 +1473,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); - last_pos = start_pos - + round_up(pos + write_bytes - start_pos, - fs_info->sectorsize) - 1; + last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; @@ -1497,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(&inode->vfs_inode, - ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); return -EAGAIN; } @@ -1872,7 +1865,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from); + written = btrfs_direct_IO(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -2025,7 +2018,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { + /* + * 1. We must always clear IOCB_DSYNC in order to not deadlock + * in iomap, as it calls generic_write_sync() in this case. + * 2. If we are async, we can call iomap_dio_complete() either + * in + * + * 2.1. A worker thread from the last bio completed. In this + * case we need to mark the btrfs_dio_data that it is + * async in order to call generic_write_sync() properly. + * This is handled by setting BTRFS_DIO_SYNC_STUB in the + * current->journal_info. + * 2.2 The submitter context, because all IO completed + * before we exited iomap_dio_rw(). In this case we can + * just re-set the IOCB_DSYNC on the iocb and we'll do + * the sync below. If our ->end_io() gets called and + * current->journal_info is set, then we know we're in + * our current context and we will clear + * current->journal_info to indicate that we need to + * sync below. + */ + if (sync) { + ASSERT(current->journal_info == NULL); + iocb->ki_flags &= ~IOCB_DSYNC; + current->journal_info = BTRFS_DIO_SYNC_STUB; + } num_written = __btrfs_direct_write(iocb, from); + + /* + * As stated above, we cleared journal_info, so we need to do + * the sync ourselves. + */ + if (sync && current->journal_info == NULL) + iocb->ki_flags |= IOCB_DSYNC; + current->journal_info = NULL; } else { num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) @@ -2065,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by setattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. + * Set by setattr when we are about to truncate a file from a non-zero + * size to a zero size. This tries to flush down new bytes that may + * have been written if the application were using truncate to replace + * a file in place. */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); return 0; @@ -2116,20 +2142,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; + u64 len; + bool full_sync; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); /* - * Set the range to full if the NO_HOLES feature is not enabled. - * This is to avoid missing file extent items representing holes after - * replaying the log. + * Always set the range to a full range, otherwise we can get into + * several problems, from missing file extent items to represent holes + * when not using the NO_HOLES feature, to log tree corruption due to + * races between hole detection during logging and completion of ordered + * extents outside the range, to missing checksums due to ordered extents + * for which we flushed only a subset of their pages. */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { - start = 0; - end = LLONG_MAX; - } + start = 0; + end = LLONG_MAX; + len = (u64)LLONG_MAX + 1; /* * We write the dirty pages in the range and wait until they complete @@ -2153,19 +2183,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges and races between logging and - * completion of ordered extents for adjancent ranges - both races - * could lead to file extent items in the log with overlapping ranges. - * Do this while holding the inode lock, to avoid races with other - * tasks. + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); /* * Before we acquired the inode's lock, someone may have dirtied more @@ -2196,20 +2219,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * - * Also, the range length can be represented by u64, we have to do the - * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. + * For a full fsync we wait for the ordered extents to complete while + * for a fast fsync we wait just for writeback to complete, and then + * attach the ordered extents to the transaction so that a transaction + * commit waits for their completion, to avoid data loss if we fsync, + * the current transaction commits before the ordered extents complete + * and a power failure happens right after that. */ - ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); - if (ret) { - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + if (full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + } else { + /* + * Get our ordered extents as soon as possible to avoid doing + * checksum lookups in the csum tree, and use instead the + * checksums attached to the ordered extents. + */ + btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), + &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->i_mapping, start, end); } + + if (ret) + goto out_release_extents; + atomic_inc(&root->log_batch); + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ smp_mb(); if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) { + (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && + (full_sync || list_empty(&ctx.ordered_extents)))) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2225,9 +2270,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. */ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } /* @@ -2244,12 +2287,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } - ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); + ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -2276,6 +2318,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + } ret = btrfs_commit_transaction(trans); } else { ret = btrfs_end_transaction(trans); @@ -2286,6 +2335,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!ret) ret = err; return ret > 0 ? -EIO : ret; + +out_release_extents: + btrfs_release_log_ctx_extents(&ctx); + up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; } static const struct vm_operations_struct btrfs_file_vm_ops = { @@ -2481,7 +2536,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + lockend); /* * We need to make sure we have no ordered extents in this range @@ -2509,11 +2565,11 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } -static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, +static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, - struct btrfs_clone_extent_info *clone_info, - const u64 clone_len) + struct btrfs_replace_extent_info *extent_info, + const u64 replace_len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2522,51 +2578,69 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; int slot; struct btrfs_ref ref = { 0 }; - u64 ref_offset; int ret; - if (clone_len == 0) + if (replace_len == 0) return 0; - if (clone_info->disk_offset == 0 && + if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = clone_info->file_offset; + key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, clone_info->extent_buf, + write_extent_buffer(leaf, extent_info->extent_buf, btrfs_item_ptr_offset(leaf, slot), - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); - btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); + if (extent_info->is_new_extent) + btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), - clone_info->file_offset, clone_len); + extent_info->file_offset, replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ - if (clone_info->disk_offset == 0) + if (extent_info->disk_offset == 0) return 0; - inode_add_bytes(inode, clone_len); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - clone_info->disk_offset, - clone_info->disk_len, 0); - ref_offset = clone_info->file_offset - clone_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), ref_offset); - ret = btrfs_inc_extent_ref(trans, &ref); + inode_add_bytes(inode, replace_len); + + if (extent_info->is_new_extent && extent_info->insertions == 0) { + key.objectid = extent_info->disk_offset; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = extent_info->disk_len; + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + extent_info->file_offset, + extent_info->qgroup_reserved, + &key); + } else { + u64 ref_offset; + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + extent_info->disk_offset, + extent_info->disk_len, 0); + ref_offset = extent_info->file_offset - extent_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + } + + extent_info->insertions++; return ret; } @@ -2574,15 +2648,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, /* * The respective range must have been previously locked, as well as the inode. * The end offset is inclusive (last byte of the range). - * @clone_info is NULL for fallocate's hole punching and non-NULL for extent - * cloning. - * When cloning, we don't want to end up in a state where we dropped extents - * without inserting a new one, so we must abort the transaction to avoid a - * corruption. + * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing + * the file range with an extent. + * When not punching a hole, we don't want to end up in a state where we dropped + * extents without inserting a new one, so we must abort the transaction to avoid + * a corruption. */ -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2611,10 +2685,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set or if we are cloning - * an extent + * 1 - adding the hole extent if no_holes isn't set or if we are + * replacing the range with a new extent */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) rsv_count = 3; else rsv_count = 2; @@ -2644,14 +2718,15 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * returned by __btrfs_drop_extents() without having * changed anything in the file. */ - if (clone_info && ret && ret != -EOPNOTSUPP) + if (extent_info && !extent_info->is_new_extent && + ret && ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); break; } trans->block_rsv = &fs_info->trans_block_rsv; - if (!clone_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); @@ -2665,7 +2740,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2685,18 +2760,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info && drop_end > clone_info->file_offset) { - u64 clone_len = drop_end - clone_info->file_offset; + if (extent_info && drop_end > extent_info->file_offset) { + u64 replace_len = drop_end - extent_info->file_offset; - ret = btrfs_insert_clone_extent(trans, inode, path, - clone_info, clone_len); + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, replace_len); if (ret) { btrfs_abort_transaction(trans, ret); break; } - clone_info->data_len -= clone_len; - clone_info->data_offset += clone_len; - clone_info->file_offset += clone_len; + extent_info->data_len -= replace_len; + extent_info->data_offset += replace_len; + extent_info->file_offset += replace_len; } cur_offset = drop_end; @@ -2720,7 +2795,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!clone_info) { + if (!extent_info) { ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; @@ -2739,7 +2814,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * than 16Mb would force the full fsync any way (when * try_release_extent_mapping() is invoked during page cache truncation. */ - if (clone_info) + if (extent_info && !extent_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -2765,7 +2840,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). */ - if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); if (ret) { @@ -2773,7 +2848,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), cur_offset, drop_end - cur_offset); @@ -2783,9 +2858,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, - clone_info->data_len); + if (extent_info) { + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, + extent_info->data_len); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2840,9 +2915,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset, btrfs_inode_sectorsize(inode)); + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, - btrfs_inode_sectorsize(inode)) - 1; + btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* @@ -2927,7 +3002,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, &trans); btrfs_free_path(path); if (ret) @@ -3044,7 +3119,7 @@ enum { RANGE_BOUNDARY_HOLE, }; -static int btrfs_zero_range_check_range_boundary(struct inode *inode, +static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { const u64 sectorsize = btrfs_inode_sectorsize(inode); @@ -3052,7 +3127,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -3077,7 +3152,7 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -3167,7 +3242,8 @@ static int btrfs_zero_range(struct inode *inode, * to cover them. */ if (!IS_ALIGNED(offset, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, offset); + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), + offset); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { @@ -3183,7 +3259,7 @@ static int btrfs_zero_range(struct inode *inode, } if (!IS_ALIGNED(offset + len, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset + len); if (ret < 0) goto out; @@ -3258,7 +3334,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 locked_end; u64 actual_end = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(inode); + int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; alloc_start = round_down(offset, blocksize); @@ -3340,7 +3416,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + locked_end); if (ordered && ordered->file_offset + ordered->num_bytes > alloc_start && @@ -3541,9 +3618,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct inode *inode = file_inode(iocb->ki_filp); + + inode_lock_shared(inode); + ret = btrfs_direct_IO(iocb, to); + inode_unlock_shared(inode); + if (ret < 0) + return ret; + } + + return generic_file_buffered_read(iocb, to, ret); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = btrfs_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index dc82fd0c80cb..af0013d3df63 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *val; - io_ctl_map_page(io_ctl, 1); /* @@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - val = io_ctl->cur; - *val = cpu_to_le64(generation); + put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *gen; + u64 cache_gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit @@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", - *gen, generation); + cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, return -ENOSPC; entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); @@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, } e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); @@ -1353,7 +1350,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9570458aa847..936c3137c646 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include "misc.h" #include "ctree.h" @@ -59,9 +59,10 @@ struct btrfs_iget_args { struct btrfs_dio_data { u64 reserve; - u64 unsubmitted_oe_range_start; - u64 unsubmitted_oe_range_end; - int overwrite; + loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; + bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -70,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; -static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -140,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, static int btrfs_dirty_inode(struct inode *inode); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode) -{ - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -} -#endif - static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -2183,9 +2176,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * * c-3) otherwise: async submit */ -static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2245,16 +2237,15 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ -static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, struct list_head *list) +static int add_pending_csums(struct btrfs_trans_handle *trans, + struct list_head *list) { struct btrfs_ordered_sum *sum; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, - BTRFS_I(inode)->root->fs_info->csum_root, sum); + ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2357,7 +2348,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -2548,7 +2539,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; @@ -2568,8 +2558,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ - return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset, - &stack_fi, oe->qgroup_rsv); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + oe->file_offset, &stack_fi, + oe->qgroup_rsv); } /* @@ -2666,8 +2657,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_ordered_extent_file_extent(trans, inode, - ordered_extent); + ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, @@ -2683,7 +2673,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -2752,7 +2742,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ - btrfs_remove_ordered_extent(inode, ordered_extent); + btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -2772,8 +2762,8 @@ static void finish_ordered_fn(struct btrfs_work *work) void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; @@ -2784,7 +2774,7 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; else wq = fs_info->endio_write_workers; @@ -2833,9 +2823,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; @@ -3055,7 +3044,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret == -ENOENT && root == fs_info->tree_root) { struct btrfs_root *dead_root; - struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; /* @@ -3395,7 +3383,6 @@ static int btrfs_read_locked_inode(struct inode *inode, switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -4051,7 +4038,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (err) { @@ -4583,7 +4570,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -4848,19 +4835,16 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. + * zero. Make sure any new writes to the file get on disk + * on close. */ if (newsize == 0) - set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the endless truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { @@ -5305,15 +5289,15 @@ static void inode_tree_add(struct inode *inode) spin_unlock(&root->inode_lock); } -static void inode_tree_del(struct inode *inode) +static void inode_tree_del(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + if (!RB_EMPTY_NODE(&inode->rb_node)) { + rb_erase(&inode->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); @@ -6311,7 +6295,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; d_instantiate_new(dentry, inode); out_unlock: @@ -6374,7 +6357,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - int ret; err = btrfs_update_inode(trans, root, inode); if (err) @@ -6389,12 +6371,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, - true, NULL); - if (ret == BTRFS_NEED_TRANS_COMMIT) { - err = btrfs_commit_transaction(trans); - trans = NULL; - } + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } fail: @@ -6540,8 +6517,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - int err = 0; + int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6569,7 +6545,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, } em = alloc_extent_map(); if (!em) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; @@ -6579,7 +6555,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -6592,14 +6568,16 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, */ path->leave_spinning = 1; + path->recurse = btrfs_is_free_space_inode(inode); + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { - err = ret; goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; + ret = 0; } leaf = path->nodes[0]; @@ -6625,7 +6603,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); @@ -6643,12 +6621,11 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } else if (ret > 0) { + else if (ret > 0) goto not_found; - } + leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -6699,10 +6676,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, extent_offset, item); - if (ret) { - err = ret; + if (ret) goto out; - } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6726,29 +6701,28 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, em->len = len; em->block_start = EXTENT_MAP_HOLE; insert: + ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); - err = -EIO; + ret = -EIO; goto out; } - err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - if (err) { + if (ret) { free_extent_map(em); - return ERR_PTR(err); + return ERR_PTR(ret); } - BUG_ON(!em); /* Error is always set */ return em; } @@ -7111,7 +7085,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, int writing) + struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7160,7 +7134,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -7249,30 +7223,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, } -static int btrfs_get_blocks_direct_read(struct extent_map *em, - struct buffer_head *bh_result, - struct inode *inode, - u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return -ENOENT; - - len = min(len, em->len - (start - em->start)); - - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - return 0; -} - static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7333,7 +7284,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ - len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { @@ -7344,64 +7294,88 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ - if (!dio_data->overwrite && start + len > i_size_read(inode)) + if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; out: return ret; } -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; - u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - u64 len = bh_result->b_size; + const bool write = !!(flags & IOMAP_WRITE); int ret = 0; + u64 len = length; + bool unlock_extents = false; + bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); - if (!create) + /* + * We used current->journal_info here to see if we were sync, but + * there's a lot of tests in the enospc machinery to not do flushing if + * we have a journal_info set, so we need to clear this out and re-set + * it in iomap_end. + */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; + + if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - if (current->journal_info) { - /* - * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transaction doesn't get - * confused. - */ - dio_data = current->journal_info; - current->journal_info = NULL; + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so we + * need to flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; } + dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); + if (!dio_data) + return -ENOMEM; + + dio_data->sync = sync; + dio_data->length = length; + if (write) { + dio_data->reserve = round_up(length, fs_info->sectorsize); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, dio_data->reserve); + if (ret) { + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + return ret; + } + } + iomap->private = dio_data; + + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, - create)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } @@ -7433,36 +7407,48 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - if (create) { - ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, - dio_data, start, len); + len = min(len, em->len - (start - em->start)); + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); if (ret < 0) goto unlock_err; - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); } else { - ret = btrfs_get_blocks_direct_read(em, bh_result, inode, - start, len); - /* Can be negative only if we read from a hole */ - if (ret < 0) { - ret = 0; - free_extent_map(em); - goto unlock_err; - } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + bh_result->b_size; - if (lockstart < lockend) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - } else { - free_extent_state(cached_state); - } + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; } + if (unlock_extents) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. + */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; + free_extent_map(em); return 0; @@ -7471,8 +7457,63 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, start, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(BTRFS_I(inode), pos, + length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } + + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, pos, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } +out: + /* + * We're all done, we can re-set the current->journal_info now safely + * for our endio. + */ + if (dio_data->sync) { + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_DIO_SYNC_STUB; + } + kfree(dio_data); + iomap->private = NULL; + return ret; } @@ -7496,7 +7537,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) dip->logical_offset + dip->bytes - 1); } - dio_end_io(dip->dio_bio); + bio_endio(dip->dio_bio); kfree(dip); } @@ -7730,24 +7771,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); - - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - /* - * Setting range start and end to the same value means that - * no cleanup will happen in btrfs_direct_IO - */ - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } return dip; } -static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) +static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); @@ -7764,6 +7792,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7772,8 +7801,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; - dio_end_io(dio_bio); - return; + bio_endio(dio_bio); + return BLK_QC_T_NONE; } if (!write && csum) { @@ -7844,15 +7873,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; } while (submit_len > 0); - return; + return BLK_QC_T_NONE; out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -7888,37 +7919,59 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned flags) +{ + /* + * Now if we're still in the context of our submitter we know we can't + * safely run generic_write_sync(), so clear our flag here so that the + * caller knows to follow up with a sync. + */ + if (current->journal_info == BTRFS_DIO_SYNC_STUB) { + current->journal_info = NULL; + return error; + } + + if (error) + return error; + + if (size) { + iocb->ki_flags |= IOCB_DSYNC; + return generic_write_sync(iocb, size); + } + + return 0; +} + +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_submit_direct, +}; + +static const struct iomap_dio_ops btrfs_sync_dops = { + .submit_io = btrfs_submit_direct, + .end_io = btrfs_maybe_fsync_end_io, +}; + +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; - int flags = 0; - bool wakeup = true; bool relock = false; ssize_t ret; if (check_direct_IO(fs_info, iter, offset)) return 0; - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. - */ count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need update @@ -7926,66 +7979,29 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; } - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { + /* + * We have are actually a sync iocb, so we need our fancy endio to know + * if we need to sync. + */ + if (current->journal_info) + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); + + if (ret == -ENOTBLK) + ret = 0; + + if (iov_iter_rw(iter) == WRITE) up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, offset, dio_data.reserve, - true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. - */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(BTRFS_I(inode), - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } -out: - if (wakeup) - inode_dio_end(inode); + if (relock) inode_lock(inode); @@ -8002,12 +8018,24 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len); + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page, btrfs_get_extent, 0); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + unsigned long bio_flags = 0; + struct bio *bio = NULL; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); + if (bio) + ret = submit_one_bio(bio, 0, bio_flags); + return ret; } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) @@ -8091,15 +8119,15 @@ static int btrfs_migratepage(struct address_space *mapping, static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct inode *inode = page->mapping->host; - struct extent_io_tree *tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; - int inode_evicting = inode->i_state & I_FREEING; + int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -8110,7 +8138,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, */ wait_on_page_writeback(page); - tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; @@ -8120,8 +8147,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, lock_extent_bits(tree, page_start, page_end, &cached_state); again: start = page_start; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - page_end - start + 1); + ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); @@ -8142,7 +8168,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct btrfs_ordered_inode_tree *tree; u64 new_len; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8181,7 +8207,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * bit of its io_tree, and free the qgroup reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | @@ -8283,7 +8309,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -8614,21 +8640,21 @@ void btrfs_free_inode(struct inode *inode) kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -void btrfs_destroy_inode(struct inode *inode) +void btrfs_destroy_inode(struct inode *vfs_inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; - WARN_ON(!hlist_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - WARN_ON(BTRFS_I(inode)->block_rsv.reserved); - WARN_ON(BTRFS_I(inode)->block_rsv.size); - WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->delalloc_bytes); - WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); - WARN_ON(BTRFS_I(inode)->csum_bytes); - WARN_ON(BTRFS_I(inode)->defrag_bytes); + WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); + WARN_ON(vfs_inode->i_data.nrpages); + WARN_ON(inode->block_rsv.reserved); + WARN_ON(inode->block_rsv.size); + WARN_ON(inode->outstanding_extents); + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8643,7 +8669,7 @@ void btrfs_destroy_inode(struct inode *inode) if (!ordered) break; else { - btrfs_err(fs_info, + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); @@ -8651,11 +8677,11 @@ void btrfs_destroy_inode(struct inode *inode) btrfs_put_ordered_extent(ordered); } } - btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); + btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); - btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); - btrfs_put_root(BTRFS_I(inode)->root); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); + btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) @@ -8780,27 +8806,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); - struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; - struct btrfs_log_ctx ctx_root; - struct btrfs_log_ctx ctx_dest; - bool sync_log_root = false; - bool sync_log_dest = false; - bool commit_transaction = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; - btrfs_init_log_ctx(&ctx_root, old_inode); - btrfs_init_log_ctx(&ctx_dest, new_inode); - /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -8942,30 +8960,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { - parent = new_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx_root); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_root = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - if (!commit_transaction) { - parent = old_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), - BTRFS_I(new_dir), parent, - false, &ctx_dest); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_dest = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; - } + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -8998,46 +9000,13 @@ static int btrfs_rename_exchange(struct inode *old_dir, dest_log_pinned = false; } } - if (!ret && sync_log_root && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, - &ctx_root); - if (ret) - commit_transaction = true; - } - if (!ret && sync_log_dest && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, - &ctx_dest); - if (ret) - commit_transaction = true; - } - if (commit_transaction) { - /* - * We may have set commit_transaction when logging the new name - * in the destination root, in which case we left the source - * root context in the list of log contextes. So make sure we - * remove it to avoid invalid memory accesses, since the context - * was allocated in our stack frame. - */ - if (sync_log_root) { - mutex_lock(&root->log_mutex); - list_del_init(&ctx_root.list); - mutex_unlock(&root->log_mutex); - } - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - ASSERT(list_empty(&ctx_root.list)); - ASSERT(list_empty(&ctx_dest.list)); - return ret; } @@ -9105,11 +9074,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; + int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - struct btrfs_log_ctx ctx; - bool sync_log = false; - bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9259,17 +9226,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { - struct dentry *parent = new_dentry->d_parent; - - btrfs_init_log_ctx(&ctx, old_inode); - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -9306,23 +9264,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_end_log_trans(root); log_pinned = false; } - if (!ret && sync_log) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); - if (ret) - commit_transaction = true; - } else if (sync_log) { - mutex_lock(&root->log_mutex); - list_del(&ctx.list); - mutex_unlock(&root->log_mutex); - } - if (commit_transaction) { - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); @@ -9388,7 +9331,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9428,9 +9371,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); - ret++; - if (nr != -1 && ret >= nr) - goto out; + if (*nr != U64_MAX) { + (*nr)--; + if (*nr == 0) + goto out; + } cond_resched(); spin_lock(&root->delalloc_lock); } @@ -9455,18 +9400,15 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + u64 nr = U64_MAX; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1, true); - if (ret > 0) - ret = 0; - return ret; + return start_delalloc_inodes(root, &nr, true); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) { struct btrfs_root *root; struct list_head splice; @@ -9489,15 +9431,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr, false); + ret = start_delalloc_inodes(root, &nr, false); btrfs_put_root(root); if (ret < 0) goto out; - - if (nr != -1) { - nr -= ret; - WARN_ON(nr < 0); - } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); @@ -9568,7 +9505,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9633,11 +9569,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, return err; } -static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, +static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_trans_handle *trans_in, struct inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; + struct btrfs_replace_extent_info extent_info; + struct btrfs_trans_handle *trans = trans_in; + struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; int ret; @@ -9654,10 +9594,40 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); if (ret < 0) - return ret; - return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset, - &stack_fi, ret); + return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, BTRFS_I(inode), + file_offset, &stack_fi, ret); + if (ret) + return ERR_PTR(ret); + return trans; + } + + extent_info.disk_offset = start; + extent_info.disk_len = len; + extent_info.data_offset = 0; + extent_info.data_len = len; + extent_info.file_offset = file_offset; + extent_info.extent_buf = (char *)&stack_fi; + extent_info.is_new_extent = true; + extent_info.qgroup_reserved = ret; + extent_info.insertions = 0; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_replace_file_extents(inode, path, file_offset, + file_offset + len - 1, &extent_info, + &trans); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + return trans; } + static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ -9680,14 +9650,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (trans) own_trans = false; while (num_bytes > 0) { - if (own_trans) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - } - cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* @@ -9699,11 +9661,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); - if (ret) { - if (own_trans) - btrfs_end_transaction(trans); + if (ret) break; - } /* * We've reserved this space, and thus converted it from @@ -9716,13 +9675,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); - if (ret) { + trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, ret); - if (own_trans) - btrfs_end_transaction(trans); break; } @@ -9785,8 +9742,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } - if (own_trans) + if (own_trans) { btrfs_end_transaction(trans); + trans = NULL; + } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, @@ -9865,7 +9824,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) @@ -10072,14 +10030,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, /* * Balance or device remove/replace/resize can move stuff around from - * under us. The EXCL_OP flag makes sure they aren't running/won't run - * concurrently while we are mapping the swap extents, and - * fs_info->swapfile_pins prevents them from running while the swap file - * is active and moving the extents. Note that this also prevents a - * concurrent device add which isn't actually necessary, but it's not + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; @@ -10225,7 +10183,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) btrfs_swap_deactivate(file); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (ret) return ret; @@ -10283,12 +10241,6 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static const struct extent_io_ops btrfs_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = btrfs_readpage_end_io_hook, -}; - /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume @@ -10306,7 +10258,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2d9109d9e98f..ab408a23ba32 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,6 +378,18 @@ static int check_xflags(unsigned int flags) return 0; } +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); +} + /* * Set the xflags from the internal inode flags. The remaining items of fsxattr * are zeroed. @@ -618,7 +630,7 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); goto fail_free; } trans->block_rsv = &block_rsv; @@ -628,7 +640,8 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -742,7 +755,7 @@ static noinline int create_subvol(struct inode *dir, kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); err = btrfs_commit_transaction(trans); if (err && !ret) @@ -856,7 +869,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); + btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); @@ -1306,7 +1319,7 @@ static int cluster_pages_for_defrag(struct inode *inode, break; unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); /* @@ -1638,7 +1651,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1752,7 +1765,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); mnt_drop_write_file(file); return ret; } @@ -3126,7 +3139,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; vol_args = memdup_user(arg, sizeof(*vol_args)); @@ -3143,7 +3156,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -3172,7 +3185,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3183,7 +3196,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -3214,7 +3227,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -3232,7 +3245,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3462,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *tmp; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3518,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, break; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3736,11 +3743,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -3951,7 +3958,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -3997,7 +4004,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } locked: - BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4052,10 +4058,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to - * btrfs_balance. bctl is freed in reset_balance_state, or, if - * restriper was paused all the way until unmount, in free_fs_info. - * The flag should be cleared after reset_balance_state. + * Ownership of bctl and exclusive operation goes to btrfs_balance. + * bctl is freed in reset_balance_state, or, if restriper was paused + * all the way until unmount, in free_fs_info. The flag should be + * cleared after reset_balance_state. */ need_unlock = false; @@ -4074,7 +4080,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); return ret; @@ -4897,7 +4903,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f75612e18a82..66e02ebdd340 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ * performance reasons. * * - * Lock nesting - * ------------ + * Lock recursion + * -------------- * * A write operation on a tree might indirectly start a look up on the same * tree. This can happen when btrfs_cow_block locks the tree and needs to @@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; btrfs_assert_tree_read_locked(eb); atomic_inc(&eb->blocking_readers); @@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); @@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * * The rwlock is held upon exit. */ -void btrfs_tree_read_lock(struct extent_buffer *eb) +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse) { u64 start_ns = 0; @@ -263,8 +264,9 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) * depends on this as it may be called on a partly * (write-)locked tree. */ - BUG_ON(eb->lock_nested); - eb->lock_nested = true; + WARN_ON(!recurse); + BUG_ON(eb->lock_recursed); + eb->lock_recursed = true; read_unlock(&eb->lock); trace_btrfs_tree_read_lock(eb, start_ns); return; @@ -279,6 +281,11 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) trace_btrfs_tree_read_lock(eb, start_ns); } +void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); +} + /* * Lock extent buffer for read, optimistically expecting that there are no * contending blocking writers. If there are, don't wait. @@ -362,11 +369,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -388,11 +395,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -409,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * * The rwlock is held for write upon exit. */ -void btrfs_tree_lock(struct extent_buffer *eb) +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -434,6 +441,11 @@ void btrfs_tree_lock(struct extent_buffer *eb) trace_btrfs_tree_lock(eb, start_ns); } +void btrfs_tree_lock(struct extent_buffer *eb) +{ + __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); +} + /* * Release the write lock, either blocking or spinning (ie. there's no need * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). @@ -552,13 +564,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * * Return: root extent buffer with read lock held */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index d715846c10b8..3ea81ed3320b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -16,11 +16,81 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +/* + * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at + * the time of this patch is 8, which is how many we use. Keep this in mind if + * you decide you want to add another subclass. + */ +enum btrfs_lock_nesting { + BTRFS_NESTING_NORMAL, + + /* + * When we COW a block we are holding the lock on the original block, + * and since our lockdep maps are rootid+level, this confuses lockdep + * when we lock the newly allocated COW'd block. Handle this by having + * a subclass for COW'ed blocks so that lockdep doesn't complain. + */ + BTRFS_NESTING_COW, + + /* + * Oftentimes we need to lock adjacent nodes on the same level while + * still holding the lock on the original node we searched to, such as + * for searching forward or for split/balance. + * + * Because of this we need to indicate to lockdep that this is + * acceptable by having a different subclass for each of these + * operations. + */ + BTRFS_NESTING_LEFT, + BTRFS_NESTING_RIGHT, + + /* + * When splitting we will be holding a lock on the left/right node when + * we need to cow that node, thus we need a new set of subclasses for + * these two operations. + */ + BTRFS_NESTING_LEFT_COW, + BTRFS_NESTING_RIGHT_COW, + + /* + * When splitting we may push nodes to the left or right, but still use + * the subsequent nodes in our path, keeping our locks on those adjacent + * blocks. Thus when we go to allocate a new split block we've already + * used up all of our available subclasses, so this subclass exists to + * handle this case where we need to allocate a new split block. + */ + BTRFS_NESTING_SPLIT, + + /* + * When promoting a new block to a root we need to have a special + * subclass so we don't confuse lockdep, as it will appear that we are + * locking a higher level node before a lower level one. Copying also + * has this problem as it appears we're locking the same block again + * when we make a snapshot of an existing root. + */ + BTRFS_NESTING_NEW_ROOT, + + /* + * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * add this in here and add a static_assert to keep us from going over + * the limit. As of this writing we're limited to 8, and we're + * definitely using 8, hence this check to keep us from messing up in + * the future. + */ + BTRFS_NESTING_MAX, +}; + +static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, + "too many lock subclasses defined"); + struct btrfs_path; +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); @@ -29,6 +99,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse); + +static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + return __btrfs_read_lock_root_node(root, false); +} #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ebac13389e7e..87bac9ecdf4c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,11 +212,12 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry); + trace_btrfs_ordered_extent_add(inode, entry); spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, @@ -377,17 +378,16 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used * to make sure this function only returns 1 once for a given ordered extent. */ -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; int ret; - tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { entry = *cached; @@ -408,7 +408,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, } if (io_size > entry->bytes_left) { - btrfs_crit(BTRFS_I(inode)->root->fs_info, + btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } @@ -441,10 +441,11 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); + trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); @@ -462,14 +463,14 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * remove an ordered extent from the tree. No references are dropped * and waiters are woken up. */ -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; + bool pending; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -491,13 +492,41 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (pending) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + refcount_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; - trace_btrfs_ordered_extent_remove(inode, entry); + trace_btrfs_ordered_extent_remove(btrfs_inode, entry); if (!root->nr_ordered_extents) { spin_lock(&fs_info->ordered_root_lock); @@ -514,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered->inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); complete(&ordered->completion); } @@ -620,12 +649,11 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, * in the extent, and it waits on the io completion code to insert * metadata into the btree corresponding to the extent */ -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, - int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; + struct btrfs_inode *inode = BTRFS_I(entry->inode); trace_btrfs_ordered_extent_start(inode, entry); @@ -635,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * for the flusher thread to find them */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->i_mapping, start, end); + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -679,7 +707,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) end = orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -690,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -774,18 +802,46 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( return entry; } +/* + * Adds all ordered extents to the given list. The list ends up sorted by the + * file_offset of the ordered extents. + */ +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *n; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + struct btrfs_ordered_extent *ordered; + + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + + ASSERT(list_empty(&ordered->log_list)); + list_add_tail(&ordered->log_list, list); + refcount_inc(&ordered->refs); + } + spin_unlock_irq(&tree->lock); +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found */ struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) @@ -803,20 +859,21 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) * try to find a checksum. This is used because we allow pages to * be reclaimed before their checksum is actually put into the btree */ -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len) +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; u32 sectorsize = btrfs_inode_sectorsize(inode); + const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset); + ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) return 0; @@ -824,10 +881,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr && disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> - inode->i_sb->s_blocksize_bits; - num_sectors = ordered_sum->len >> - inode->i_sb->s_blocksize_bits; + i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits; + num_sectors = ordered_sum->len >> blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); memcpy(sum + index, ordered_sum->sums + i * csum_size, num_sectors * csum_size); @@ -883,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent_cached(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d61ea9c880a3..c3a2325e64a4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -56,6 +56,12 @@ enum { BTRFS_ORDERED_TRUNCATED, /* Regular IO for COW */ BTRFS_ORDERED_REGULAR, + /* Used during fsync to track already logged extents */ + BTRFS_ORDERED_LOGGED, + /* We have already logged all the csums of the ordered extent */ + BTRFS_ORDERED_LOGGED_CSUM, + /* We wait for this extent to complete in the current transaction */ + BTRFS_ORDERED_PENDING, }; struct btrfs_ordered_extent { @@ -104,6 +110,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* used for fast fsyncs */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -142,9 +151,9 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) } void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, @@ -165,17 +174,18 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len); +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list); +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 80567c11ec12..7695c4783d33 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,44 @@ #include "disk-io.h" #include "print-tree.h" +struct root_name_map { + u64 id; + char name[16]; +}; + +static const struct root_name_map root_map[] = { + { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, + { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, + { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, + { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, + { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, + { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, + { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, + { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, + { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, + { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, +}; + +const char *btrfs_root_name(u64 objectid, char *buf) +{ + int i; + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) { + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, + "TREE_RELOC offset=%llu", objectid); + return buf; + } + + for (i = 0; i < ARRAY_SIZE(root_map); i++) { + if (root_map[i].id == objectid) + return root_map[i].name; + } + + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid); + return buf; +} + static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index e6bb38fd75ad..78b99385a503 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,7 +6,11 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +/* Buffer size to contain tree name and possibly additional data (offset) */ +#define BTRFS_ROOT_NAME_BUF_LEN 48 + void btrfs_print_leaf(struct extent_buffer *l); void btrfs_print_tree(struct extent_buffer *c, bool follow); +const char *btrfs_root_name(u64 objectid, char *buf); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c0f350c3a0cf..580899bdb991 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * Update qgroup rfer/excl counters. * Rfer update is easy, codes can explain themselves. * - * Excl update is tricky, the update is split into 2 part. + * Excl update is tricky, the update is split into 2 parts. * Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 243a2e44526e..9d4f5316a7e8 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -767,31 +767,39 @@ static void reada_start_machine_worker(struct btrfs_work *work) kfree(rmw); } -static void __reada_start_machine(struct btrfs_fs_info *fs_info) +/* Try to start up to 10k READA requests for a group of devices */ +static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 enqueued; u64 total = 0; - int i; + struct btrfs_device *device; -again: do { enqueued = 0; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(device); } - mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } + return total; +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; + int i; + u64 enqueued = 0; + + mutex_lock(&fs_devices->device_list_mutex); + + enqueued += reada_start_for_fsdevs(fs_devices); + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + enqueued += reada_start_for_fsdevs(seed_devs); + + mutex_unlock(&fs_devices->device_list_mutex); if (enqueued == 0) return; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5cd02514cf4d..99aa87c08912 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -45,7 +45,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, return ret; } -static int copy_inline_to_page(struct inode *inode, +static int copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, @@ -58,6 +58,7 @@ static int copy_inline_to_page(struct inode *inode, char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct page *page = NULL; + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); @@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode, * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - file_offset, block_size); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); if (ret) goto out; - page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(inode->i_mapping)); + page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(mapping)); if (!page) { ret = -ENOMEM; goto out_unlock; } set_page_extent_mapped(page); - clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end, - 0, NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -134,9 +134,9 @@ static int copy_inline_to_page(struct inode *inode, put_page(page); } if (ret) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - file_offset, block_size, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); @@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(dst, new_key->offset, + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; @@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -231,8 +231,8 @@ static int clone_copy_inline_extent(struct inode *dst, * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -439,7 +439,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; + struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b @@ -462,8 +462,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; - clone_info.item_size = size; - ret = btrfs_punch_hole_range(inode, path, drop_start, + clone_info.is_new_extent = false; + ret = btrfs_replace_file_extents(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) @@ -520,6 +520,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, ret = -EINTR; goto out; } + + cond_resched(); } ret = 0; @@ -533,7 +535,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, btrfs_release_path(path); path->leave_spinning = 0; - ret = btrfs_punch_hole_range(inode, path, last_dest_end, + ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4ba1ab9cc76d..3602806d71bd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1206,7 +1206,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, } if (cow) { - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1274,7 +1275,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); + slot, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * relocated and the block is tree root. */ leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, + BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); if (ret < 0) @@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); + slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret < 0) { diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index c89697486366..702dc5441f03 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + if (!ret) { + spin_lock(&rsv->lock); + rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&rsv->lock); + } return ret; } -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); + struct btrfs_fs_info *fs_info = root->fs_info; + u64 qgroup_to_release; + + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); + btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 354ab9985a34..cf63f1e27a27 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int success; bool full_stripe_locked; unsigned int nofs_flag; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); @@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("i/o error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.verify_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9813a5b075a..340c76a12ce1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -122,8 +122,6 @@ struct send_ctx { struct file_ra_state ra; - char *read_buf; - /* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of @@ -278,11 +276,6 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, @@ -584,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); - hdr->tlv_type = cpu_to_le16(attr); - hdr->tlv_len = cpu_to_le16(len); + put_unaligned_le16(attr, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); memcpy(hdr + 1, data, len); sctx->send_size += total_len; @@ -695,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->cmd = cpu_to_le16(cmd); + put_unaligned_le16(cmd, &hdr->cmd); return 0; } @@ -707,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx) u32 crc; hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); - hdr->crc = 0; + put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); + put_unaligned_le32(0, &hdr->crc); crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); - hdr->crc = cpu_to_le32(crc); + put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; return ret; @@ -3812,6 +3805,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) return 0; } +/* + * When processing the new references for an inode we may orphanize an existing + * directory inode because its old name conflicts with one of the new references + * of the current inode. Later, when processing another new reference of our + * inode, we might need to orphanize another inode, but the path we have in the + * reference reflects the pre-orphanization name of the directory we previously + * orphanized. For example: + * + * parent snapshot looks like: + * + * . (ino 256) + * |----- f1 (ino 257) + * |----- f2 (ino 258) + * |----- d1/ (ino 259) + * |----- d2/ (ino 260) + * + * send snapshot looks like: + * + * . (ino 256) + * |----- d1 (ino 258) + * |----- f2/ (ino 259) + * |----- f2_link/ (ino 260) + * | |----- f1 (ino 257) + * | + * |----- d2 (ino 258) + * + * When processing inode 257 we compute the name for inode 259 as "d1", and we + * cache it in the name cache. Later when we start processing inode 258, when + * collecting all its new references we set a full path of "d1/d2" for its new + * reference with name "d2". When we start processing the new references we + * start by processing the new reference with name "d1", and this results in + * orphanizing inode 259, since its old reference causes a conflict. Then we + * move on the next new reference, with name "d2", and we find out we must + * orphanize inode 260, as its old reference conflicts with ours - but for the + * orphanization we use a source path corresponding to the path we stored in the + * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the + * receiver fail since the path component "d1/" no longer exists, it was renamed + * to "o259-6-0/" when processing the previous new reference. So in this case we + * must recompute the path in the new reference and use it for the new + * orphanization operation. + */ +static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + char *name; + int ret; + + name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + fs_path_reset(ref->full_path); + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); + if (ret < 0) + goto out; + + ret = fs_path_add(ref->full_path, name, ref->name_len); + if (ret < 0) + goto out; + + /* Update the reference's base name pointer. */ + set_ref_path(ref, ref->full_path); +out: + kfree(name); + return ret; +} + /* * This does all the move/link/unlink/rmdir magic. */ @@ -3880,52 +3939,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) goto out; } + /* + * Before doing any rename and link operations, do a first pass on the + * new references to orphanize any unprocessed inodes that may have a + * reference that conflicts with one of the new references of the current + * inode. This needs to happen first because a new reference may conflict + * with the old reference of a parent directory, so we must make sure + * that the path used for link and rename commands don't use an + * orphanized name when an ancestor was not yet orphanized. + * + * Example: + * + * Parent snapshot: + * + * . (ino 256) + * |----- testdir/ (ino 259) + * | |----- a (ino 257) + * | + * |----- b (ino 258) + * + * Send snapshot: + * + * . (ino 256) + * |----- testdir_2/ (ino 259) + * | |----- a (ino 260) + * | + * |----- testdir (ino 257) + * |----- b (ino 257) + * |----- b2 (ino 258) + * + * Processing the new reference for inode 257 with name "b" may happen + * before processing the new reference with name "testdir". If so, we + * must make sure that by the time we send a link command to create the + * hard link "b", inode 259 was already orphanized, since the generated + * path in "valid_path" already contains the orphanized name for 259. + * We are processing inode 257, so only later when processing 259 we do + * the rename operation to change its temporary (orphanized) name to + * "testdir_2". + */ list_for_each_entry(cur, &sctx->new_refs, list) { - /* - * We may have refs where the parent directory does not exist - * yet. This happens if the parent directories inum is higher - * than the current inum. To handle this case, we create the - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - if (ret == inode_state_will_create) { - ret = 0; - /* - * First check if any of the current inodes refs did - * already create the dir. - */ - list_for_each_entry(cur2, &sctx->new_refs, list) { - if (cur == cur2) - break; - if (cur2->dir == cur->dir) { - ret = 1; - break; - } - } - - /* - * If that did not happen, check if a previous inode - * did already create the dir. - */ - if (!ret) - ret = did_create_dir(sctx, cur->dir); - if (ret < 0) - goto out; - if (!ret) { - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; - } - } + if (ret == inode_state_will_create) + continue; /* - * Check if this new ref would overwrite the first ref of - * another unprocessed inode. If yes, orphanize the - * overwritten inode. If we find an overwritten ref that is - * not the first ref, simply unlink it. + * Check if this new ref would overwrite the first ref of another + * unprocessed inode. If yes, orphanize the overwritten inode. + * If we find an overwritten ref that is not the first ref, + * simply unlink it. */ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, @@ -3942,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct name_cache_entry *nce; struct waiting_dir_move *wdm; + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) @@ -4004,6 +4073,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) } } + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * than the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { ret = wait_for_dest_dir_move(sctx, cur, is_orphan); if (ret < 0) @@ -4799,7 +4911,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx) return ret; } -static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static inline u64 max_send_read_size(const struct send_ctx *sctx) +{ + return sctx->send_max_size - SZ_16K; +} + +static int put_data_header(struct send_ctx *sctx, u32 len) +{ + struct btrfs_tlv_header *hdr; + + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + return 0; +} + +static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4809,21 +4939,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); - ssize_t ret = 0; + int ret; + + ret = put_data_header(sctx, len); + if (ret) + return ret; inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); - if (offset + len > i_size_read(inode)) { - if (offset > i_size_read(inode)) - len = 0; - else - len = offset - i_size_read(inode); - } - if (len == 0) - goto out; - last_index = (offset + len - 1) >> PAGE_SHIFT; /* initial readahead */ @@ -4864,16 +4989,16 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) } addr = kmap(page); - memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset, + cur_len); kunmap(page); unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - ret += cur_len; + sctx->send_size += cur_len; } -out: iput(inode); return ret; } @@ -4887,7 +5012,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - ssize_t num_read = 0; p = fs_path_alloc(); if (!p) @@ -4895,13 +5019,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - num_read = fill_read_buf(sctx, offset, len); - if (num_read <= 0) { - if (num_read < 0) - ret = num_read; - goto out; - } - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) goto out; @@ -4912,16 +5029,16 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + ret = put_file_data(sctx, offset, len); + if (ret < 0) + goto out; ret = send_cmd(sctx); tlv_put_failure: out: fs_path_free(p); - if (ret < 0) - return ret; - return num_read; + return ret; } /* @@ -5033,8 +5150,8 @@ static int send_update_extent(struct send_ctx *sctx, static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; + u64 read_size = max_send_read_size(sctx); u64 offset = sctx->cur_inode_last_extent; - u64 len; int ret = 0; /* @@ -5061,16 +5178,19 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); if (ret < 0) goto tlv_put_failure; - memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { - len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + u64 len = min(end - offset, read_size); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_data_header(sctx, len); + if (ret < 0) + break; + memset(sctx->send_buf + sctx->send_size, 0, len); + sctx->send_size += len; ret = send_cmd(sctx); if (ret < 0) break; @@ -5086,23 +5206,20 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); while (sent < len) { - u64 size = len - sent; + u64 size = min(len - sent, read_size); int ret; - if (size > BTRFS_SEND_READ_SIZE) - size = BTRFS_SEND_READ_SIZE; ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; - if (!ret) - break; - sent += ret; + sent += size; } return 0; } @@ -5402,51 +5519,29 @@ static int send_write_or_clone(struct send_ctx *sctx, struct clone_root *clone_root) { int ret = 0; - struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 len; - u8 type; + u64 end; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - /* - * it is possible the inline item won't cover the whole page, - * but there may be items after this page. Make - * sure to send the whole thing - */ - len = PAGE_ALIGN(len); - } else { - len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - } + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; - if (offset >= sctx->cur_inode_size) { - ret = 0; - goto out; - } - if (offset + len > sctx->cur_inode_size) - len = sctx->cur_inode_size - offset; - if (len == 0) { - ret = 0; - goto out; - } - - if (clone_root && IS_ALIGNED(offset + len, bs)) { + if (clone_root && IS_ALIGNED(end, bs)) { + struct btrfs_file_extent_item *ei; u64 disk_byte; u64 data_offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, len); + offset, end - offset); } else { - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, offset, end - offset); } - sctx->cur_inode_next_write_offset = offset + len; -out: + sctx->cur_inode_next_write_offset = end; return ret; } @@ -6692,8 +6787,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. */ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) + struct btrfs_root *right_root, void *ctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6960,8 +7054,7 @@ static int send_subvol(struct send_ctx *sctx) goto out; if (sctx->parent_root) { - ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, - changed_cb, sctx); + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; ret = finish_inode_if_needed(sctx, 1); @@ -7087,7 +7180,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; - unsigned alloc_size; + size_t alloc_size; int sort_clone_roots = 0; if (!capable(CAP_SYS_ADMIN)) @@ -7169,25 +7262,20 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } - sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } - alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + alloc_size = array_size(sizeof(*arg->clone_sources), + arg->clone_sources_count); if (arg->clone_sources_count) { clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); @@ -7378,7 +7466,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) kvfree(sctx->clone_roots); kvfree(sctx->send_buf); - kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index ead397f7034f..de91488b7cd0 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -13,7 +13,6 @@ #define BTRFS_SEND_STREAM_VERSION 1 #define BTRFS_SEND_BUF_SIZE SZ_64K -#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 475968ccbd1d..64099565ab8f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry(found, head, list) found->full = 0; - rcu_read_unlock(); } static int create_space_info(struct btrfs_fs_info *info, u64 flags) @@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (ret) return ret; - list_add_rcu(&space_info->list, &info->space_info); + list_add(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; @@ -290,22 +288,13 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); + list_for_each_entry(found, head, list) { + if (found->flags & flags) return found; - } } - rcu_read_unlock(); return NULL; } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) @@ -476,28 +465,6 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, up_read(&info->groups_sem); } -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). - */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { @@ -516,25 +483,33 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 to_reclaim, bool wait_ordered) { - struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 dio_bytes; - u64 async_pages; u64 items; long time_left; - unsigned long nr_pages; int loops; /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + if (to_reclaim == U64_MAX) { + items = U64_MAX; + } else { + /* + * to_reclaim is set to however much metadata we need to + * reclaim, but reclaiming that much data doesn't really track + * exactly, so increase the amount to reclaim by 2x in order to + * make sure we're flushing enough delalloc to hopefully reclaim + * some metadata reservations. + */ + items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + } trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -557,44 +532,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; - - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); - - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. - */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; - - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets) && - list_empty(&space_info->priority_tickets)) { - spin_unlock(&space_info->lock); - break; - } - spin_unlock(&space_info->lock); + btrfs_start_delalloc_roots(fs_info, items); loops++; if (wait_ordered && !trans) { @@ -604,6 +542,15 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, if (time_left) break; } + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); @@ -628,8 +575,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; - u64 bytes_needed; u64 reclaim_bytes = 0; + u64 bytes_needed = 0; u64 cur_free_bytes = 0; trans = (struct btrfs_trans_handle *)current->journal_info; @@ -649,7 +596,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; + if (ticket) + bytes_needed = ticket->bytes; if (bytes_needed > cur_free_bytes) bytes_needed -= cur_free_bytes; @@ -676,8 +624,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, goto commit; /* - * See if there is some space in the delayed insertion reservation for - * this reservation. + * See if there is some space in the delayed insertion reserve for this + * reservation. If the space_info's don't match (like for DATA or + * SYSTEM) then just go enospc, reclaiming this space won't recover any + * space to satisfy those reservations. */ if (space_info != delayed_rsv->space_info) goto enospc; @@ -742,7 +692,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + shrink_delalloc(fs_info, space_info, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: @@ -767,7 +717,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } ret = btrfs_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), + btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); @@ -1037,9 +987,132 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +/* + * FLUSH_DELALLOC_WAIT: + * Space is freed from flushing delalloc in one of two ways. + * + * 1) compression is on and we allocate less space than we reserved + * 2) we are overwriting existing space + * + * For #1 that extra space is reclaimed as soon as the delalloc pages are + * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent + * length to ->bytes_reserved, and subtracts the reserved space from + * ->bytes_may_use. + * + * For #2 this is trickier. Once the ordered extent runs we will drop the + * extent in the range we are overwriting, which creates a delayed ref for + * that freed extent. This however is not reclaimed until the transaction + * commits, thus the next stages. + * + * RUN_DELAYED_IPUTS + * If we are freeing inodes, we want to make sure all delayed iputs have + * completed, because they could have been on an inode with i_nlink == 0, and + * thus have been truncated and freed up space. But again this space is not + * immediately re-usable, it comes in the form of a delayed ref, which must be + * run and then the transaction must be committed. + * + * FLUSH_DELAYED_REFS + * The above two cases generate delayed refs that will affect + * ->total_bytes_pinned. However this counter can be inconsistent with + * reality if there are outstanding delayed refs. This is because we adjust + * the counter based solely on the current set of delayed refs and disregard + * any on-disk state which might include more refs. So for example, if we + * have an extent with 2 references, but we only drop 1, we'll see that there + * is a negative delayed ref count for the extent and assume that the space + * will be freed, and thus increase ->total_bytes_pinned. + * + * Running the delayed refs gives us the actual real view of what will be + * freed at the transaction commit time. This stage will not actually free + * space for us, it just makes sure that may_commit_transaction() has all of + * the information it needs to make the right decision. + * + * COMMIT_TRANS + * This is where we reclaim all of the pinned space generated by the previous + * two stages. We will not commit the transaction if we don't think we're + * likely to satisfy our request, which means if our current free space + + * total_bytes_pinned < reservation we will not commit. This is why the + * previous states are actually important, to make sure we know for sure + * whether committing the transaction will allow us to make progress. + * + * ALLOC_CHUNK_FORCE + * For data we start with alloc chunk force, however we could have been full + * before, and then the transaction commit could have freed new block groups, + * so if we now have space to allocate do the force chunk allocation. + */ +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_WAIT, + RUN_DELAYED_IPUTS, + FLUSH_DELAYED_REFS, + COMMIT_TRANS, + ALLOC_CHUNK_FORCE, +}; + +static void btrfs_async_reclaim_data_space(struct work_struct *work) { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 last_tickets_id; + int flush_state = 0; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + } + + while (flush_state < ARRAY_SIZE(data_flush_states)) { + flush_space(fs_info, space_info, U64_MAX, + data_flush_states[flush_state]); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = 0; + } + + if (flush_state >= ARRAY_SIZE(data_flush_states)) { + if (space_info->full) { + if (maybe_fail_all_tickets(fs_info, space_info)) + flush_state = 0; + else + space_info->flush = 0; + } else { + flush_state = 0; + } + } + spin_unlock(&space_info->lock); + } +} + +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) +{ + INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1089,6 +1162,21 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } while (flush_state < states_nr); } +static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket) +{ + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } +} + static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) @@ -1141,6 +1229,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, int ret; switch (flush) { + case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: wait_reserve_ticket(fs_info, space_info, ticket); @@ -1155,6 +1244,9 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; + case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: + priority_reclaim_data_space(fs_info, space_info, ticket); + break; default: ASSERT(0); break; @@ -1214,11 +1306,11 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { + struct work_struct *async_work; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -1227,6 +1319,11 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, ASSERT(orig_bytes); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; + else + async_work = &fs_info->async_reclaim_work; + spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); @@ -1268,7 +1365,8 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); if (flush == BTRFS_RESERVE_FLUSH_ALL || - flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { space_info->flush = 1; @@ -1276,8 +1374,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + queue_work(system_unbound_wq, async_work); } } else { list_add_tail(&ticket.list, @@ -1329,8 +1426,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush); + ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -1348,3 +1444,32 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } return ret; } + +/** + * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation + * @fs_info - the filesystem + * @bytes - the number of bytes we need + * @flush - how we are allowed to flush + * + * This will reserve bytes from the data space info. If there is not enough + * space then we will attempt to flush space as specified by flush. + */ +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + int ret; + + ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + + ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", + data_sinfo->flags, bytes, 1); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + } + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c3c64019950a..5646393b928c 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); #endif /* BTRFS_SPACE_INFO_H */ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 079b059818e9..c46be27be700 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -7,16 +7,6 @@ #include "ctree.h" -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 25967ecaaf0a..8840a4fa81eb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1871,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * the filesystem is busy. */ cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); btrfs_discard_cleanup(fs_info); @@ -2163,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; int mixed = 0; - rcu_read_lock(); - list_for_each_entry_rcu(found, &fs_info->space_info, list) { + list_for_each_entry(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { int i; @@ -2193,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) total_used += found->disk_used; } - rcu_read_unlock(); - buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); buf->f_blocks >>= bits; buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5be30066563c..279d9262b676 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -14,6 +14,7 @@ #include "ctree.h" #include "discard.h" #include "disk-io.h" +#include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" @@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); +} +BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), + BTRFS_ATTR_PTR(static_feature, send_stream_version), NULL }; @@ -809,6 +818,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, BTRFS_ATTR(, checksum, btrfs_checksum_show); +static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + const char *str; + + switch (READ_ONCE(fs_info->exclusive_operation)) { + case BTRFS_EXCLOP_NONE: + str = "none\n"; + break; + case BTRFS_EXCLOP_BALANCE: + str = "balance\n"; + break; + case BTRFS_EXCLOP_DEV_ADD: + str = "device add\n"; + break; + case BTRFS_EXCLOP_DEV_REMOVE: + str = "device remove\n"; + break; + case BTRFS_EXCLOP_DEV_REPLACE: + str = "device replace\n"; + break; + case BTRFS_EXCLOP_RESIZE: + str = "resize\n"; + break; + case BTRFS_EXCLOP_SWAP_ACTIVATE: + str = "swap activate\n"; + break; + default: + str = "UNKNOWN\n"; + break; + } + return scnprintf(buf, PAGE_SIZE, "%s", str); +} +BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -817,6 +862,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), + BTRFS_ATTR_PTR(, exclusive_operation), NULL, }; @@ -935,12 +981,24 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } +static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) + btrfs_sysfs_remove_device(device); + } +} + void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; - btrfs_reset_fs_info_ptr(fs_info); - sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { @@ -964,7 +1022,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -973,7 +1031,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_INCOMPAT] = "incompat", }; -const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } @@ -1079,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + /* + * We call this either on mount, or if we've created a block group for a + * new index type while running (i.e. when restriping). The running + * case is tricky because we could race with other threads, so we need + * to have this check to make sure we didn't already init the kobject. + * + * We don't have to protect on the free side because it only happens on + * unmount. + */ + spin_lock(&space_info->lock); + if (space_info->block_group_kobjs[index]) { + spin_unlock(&space_info->lock); + kobject_put(&rkobj->kobj); + return; + } else { + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + spin_unlock(&space_info->lock); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { + spin_lock(&space_info->lock); + space_info->block_group_kobjs[index] = NULL; + spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } - - space_info->block_group_kobjs[index] = &rkobj->kobj; } /* @@ -1151,52 +1230,30 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -/* when one_device is NULL, it removes all device links */ - -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; + struct kobject *devices_kobj; - if (!fs_devices->devices_kobj) - return -EINVAL; + /* + * Seed fs_devices devices_kobj aren't used, fetch kobject from the + * fs_info::fs_devices. + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + ASSERT(devices_kobj); - if (one_device) { - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - - if (one_device->devid_kobj.state_initialized) { - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); - } - - return 0; + if (device->bdev) { + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(devices_kobj, disk_kobj->name); } - list_for_each_entry(one_device, &fs_devices->devices, dev_list) { - - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - if (one_device->devid_kobj.state_initialized) { - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); - } + if (device->devid_kobj.state_initialized) { + kobject_del(&device->devid_kobj); + kobject_put(&device->devid_kobj); + wait_for_completion(&device->kobj_unregister); } - - return 0; } static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, @@ -1277,44 +1334,80 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_device *device) { - int error = 0; - struct btrfs_device *dev; + int ret; unsigned int nofs_flag; + struct kobject *devices_kobj; + struct kobject *devinfo_kobj; + + /* + * Make sure we use the fs_info::fs_devices to fetch the kobjects even + * for the seed fs_devices + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; + ASSERT(devices_kobj); + ASSERT(devinfo_kobj); nofs_flag = memalloc_nofs_save(); - list_for_each_entry(dev, &fs_devices->devices, dev_list) { - if (one_device && one_device != dev) - continue; + if (device->bdev) { + struct hd_struct *disk; + struct kobject *disk_kobj; - if (dev->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; - disk = dev->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - - error = sysfs_create_link(fs_devices->devices_kobj, - disk_kobj, disk_kobj->name); - if (error) - break; - } - - init_completion(&dev->kobj_unregister); - error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devinfo_kobj, "%llu", - dev->devid); - if (error) { - kobject_put(&dev->devid_kobj); - break; + ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); + if (ret) { + btrfs_warn(device->fs_info, + "creating sysfs device link for devid %llu failed: %d", + device->devid, ret); + goto out; } } - memalloc_nofs_restore(nofs_flag); - return error; + init_completion(&device->kobj_unregister); + ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, + devinfo_kobj, "%llu", device->devid); + if (ret) { + kobject_put(&device->devid_kobj); + btrfs_warn(device->fs_info, + "devinfo init for devid %llu failed: %d", + device->devid, ret); + } + +out: + memalloc_nofs_restore(nofs_flag); + return ret; +} + +static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_device *device; + struct btrfs_fs_devices *seed; + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + goto fail; + } + } + + return 0; + +fail: + btrfs_sysfs_remove_fs_devices(fs_devices); + return ret; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) @@ -1328,8 +1421,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid) +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) + { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; @@ -1337,7 +1430,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); @@ -1404,15 +1497,13 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - btrfs_set_fs_info_ptr(fs_info); - - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); + error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_remove_devices_dir(fs_devs, NULL); + btrfs_sysfs_remove_fs_devices(fs_devs); return error; } @@ -1630,12 +1721,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 features; - int ret; + u64 __maybe_unused features; + int __maybe_unused ret; if (!fs_info) return; + /* + * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not + * safe when called from some contexts (eg. balance) + */ features = get_features(fs_info, set); ASSERT(bit & supported_feature_masks[set]); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index cf839c46a131..bacef43f7267 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -13,15 +13,12 @@ enum btrfs_feature_set { }; char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -const char * const btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +const char *btrfs_feature_set_name(enum btrfs_feature_set set); +int btrfs_sysfs_add_device(struct btrfs_device *device); +void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index a1b9f9b5978e..df54cdfdc250 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, &key, &value_len, 1); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 894a63a92236..e6719f7db386 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); } /* @@ -951,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } BTRFS_I(inode)->root = root; - btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d2fc292ac61b..52ada47aff50 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -292,6 +292,8 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, } cur_trans->fs_info = fs_info; + atomic_set(&cur_trans->pending_ordered, 0); + init_waitqueue_head(&cur_trans->pending_wait); atomic_set(&cur_trans->num_writers, 1); extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); @@ -1182,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, - 0, &eb); + 0, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -1587,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); @@ -2165,6 +2168,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(trans); + /* + * Wait for all ordered extents started by a fast fsync that joined this + * transaction. Otherwise if this transaction commits before the ordered + * extents complete we lose logged data after a power failure. + */ + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); + btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d60b055b8695..858d9153a1cd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -85,6 +85,13 @@ struct btrfs_transaction { spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; struct btrfs_fs_info *fs_info; + + /* + * Number of ordered extents the transaction must wait for before + * committing. These are ordered extents started by a fast fsync. + */ + atomic_t pending_ordered; + wait_queue_head_t pending_wait; }; #define __TRANS_FREEZABLE (1U << 0) @@ -105,6 +112,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) +#define BTRFS_DIO_SYNC_STUB ((void *)2) struct btrfs_trans_handle { u64 transid; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b1fee630f97..f0ffd5ee77bd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_root_item ri; + struct btrfs_root_item ri = { 0 }; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; int ret; @@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (ret < 0) return ret; - if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && + btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { generic_err(leaf, slot, - "invalid root item size, have %u expect %zu", - btrfs_item_size_nr(leaf, slot), sizeof(ri)); + "invalid root item size, have %u expect %zu or %u", + btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); } + /* + * For legacy root item, the members starting at generation_v2 will be + * all filled with 0. + * And since we allow geneartion_v2 as 0, it will still pass the check. + */ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - sizeof(ri)); + btrfs_item_size_nr(leaf, slot)); /* Generation related */ if (btrfs_root_generation(&ri) > diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 39da9db35278..56cbc1706b6f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -96,8 +96,6 @@ enum { static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -176,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); - if (ctx) { + if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -215,9 +213,7 @@ static int join_running_log_trans(struct btrfs_root *root) */ void btrfs_pin_log_trans(struct btrfs_root *root) { - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); } /* @@ -3615,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * search and this search we'll not find the key again and can just * bail. */ +search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret != 0) goto done; @@ -3634,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (min_key.objectid != ino || min_key.type != key_type) goto done; + + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } + ret = overwrite_item(trans, log, dst_path, src, i, &min_key); if (ret) { @@ -4082,10 +4086,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, - const struct extent_map *em) + const struct extent_map *em, + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; u64 csum_offset; u64 csum_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; LIST_HEAD(ordered_sums); int ret = 0; @@ -4094,13 +4102,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, em->block_start == EXTENT_MAP_HOLE) return 0; + list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { + const u64 ordered_end = ordered->file_offset + ordered->num_bytes; + const u64 mod_end = mod_start + mod_len; + struct btrfs_ordered_sum *sums; + + if (mod_len == 0) + break; + + if (ordered_end <= mod_start) + continue; + if (mod_end <= ordered->file_offset) + break; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this ordered + * extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered_end >= mod_end) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered_end < mod_end) { + mod_len = mod_end - ordered_end; + mod_start = ordered_end; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) + continue; + + list_for_each_entry(sums, &ordered->list, list) { + ret = log_csums(trans, inode, log_root, sums); + if (ret) + return ret; + } + } + + /* We're done, found all csums in the ordered extents. */ + if (mod_len == 0) + return 0; + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = em->mod_start - em->start; - csum_len = em->mod_len; + csum_offset = mod_start - em->start; + csum_len = mod_len; } /* block start is already adjusted for the file extent offset. */ @@ -4140,7 +4206,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int extent_inserted = 0; - ret = log_extent_csums(trans, inode, log, em); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4342,10 +4408,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - const u64 start, - const u64 end) + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; @@ -4359,23 +4425,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { - /* - * Skip extents outside our logging range. It's important to do - * it for correctness because if we don't ignore them, we may - * log them before their ordered extent completes, and therefore - * we could log them without logging their respective checksums - * (the checksum items are added to the csum tree at the very - * end of btrfs_finish_ordered_io()). Also leave such extents - * outside of our range in the list, since we may have another - * ranged fsync in the near future that needs them. If an extent - * outside our range corresponds to a hole, log it to avoid - * leaving gaps between extents (fsck will complain when we are - * not using the NO_HOLES feature). - */ - if ((em->start > end || em->start + em->len <= start) && - em->block_start != EXTENT_MAP_HOLE) - continue; - list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4434,8 +4483,32 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) + return ret; - return ret; + /* + * We have logged all extents successfully, now make sure the commit of + * the current transaction waits for the ordered extents to complete + * before it commits and wipes out the log trees, otherwise we would + * lose data if an ordered extents completes after the transaction + * commits and a power failure happens after the transaction commit. + */ + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); + + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + spin_lock_irq(&inode->ordered_tree.lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&inode->ordered_tree.lock); + } + btrfs_put_ordered_extent(ordered); + } + + return 0; } static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, @@ -4841,7 +4914,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE_ALL, - 0, LLONG_MAX, ctx); + ctx); btrfs_add_delayed_iput(inode); } } @@ -4883,7 +4956,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of * the inode is not updated when we only log that it exists and - * and it has the full sync bit set (see btrfs_log_inode()). + * it has the full sync bit set (see btrfs_log_inode()). */ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -4899,7 +4972,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. */ ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, 0, LLONG_MAX, ctx); + LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5112,8 +5185,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct btrfs_path *path; @@ -5292,7 +5363,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx, start, end); + ctx); if (ret) { err = ret; goto out_unlock; @@ -5301,31 +5372,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct extent_map *em, *n; write_lock(&em_tree->lock); - /* - * We can't just remove every em if we're called for a ranged - * fsync - that is, one that doesn't cover the whole possible - * file range (0 to LLONG_MAX). This is because we can have - * em's that fall outside the range we're logging and therefore - * their ordered operations haven't completed yet - * (btrfs_finish_ordered_io() not invoked yet). This means we - * didn't get their respective file extent item in the fs/subvol - * tree yet, and need to let the next fast fsync (one which - * consults the list of modified extent maps) find the em so - * that it logs a matching file extent item and waits for the - * respective ordered operation to complete (if it's still - * running). - * - * Removing every em outside the range we're logging would make - * the next fast fsync not log their matching file extent items, - * therefore making us lose data after a log replay. - */ - list_for_each_entry_safe(em, n, &em_tree->modified_extents, - list) { - const u64 mod_end = em->mod_start + em->mod_len - 1; - - if (em->mod_start >= start && mod_end <= end) - list_del_init(&em->list); - } + list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) + list_del_init(&em->list); write_unlock(&em_tree->lock); } @@ -5339,19 +5387,34 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* - * Don't update last_log_commit if we logged that an inode exists after - * it was loaded to memory (full_sync bit set). - * This is to prevent data loss when we do a write to the inode, then - * the inode gets evicted after all delalloc was flushed, then we log - * it exists (due to a rename for example) and then fsync it. This last - * fsync would do nothing (not logging the extents previously written). + * If we are logging that an ancestor inode exists as part of logging a + * new name from a link or rename operation, don't mark the inode as + * logged - otherwise if an explicit fsync is made against an ancestor, + * the fsync considers the inode in the log and doesn't sync the log, + * resulting in the ancestor missing after a power failure unless the + * log was synced as part of an fsync against any other unrelated inode. + * So keep it simple for this case and just don't flag the ancestors as + * logged. */ - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - if (inode_only != LOG_INODE_EXISTS || - !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); + if (!ctx || + !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && + &inode->vfs_inode != ctx->inode)) { + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists + * after it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, + * then the inode gets evicted after all delalloc was flushed, + * then we log it exists (due to a rename for example) and then + * fsync it. This last fsync would do nothing (not logging the + * extents previously written). + */ + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } out_unlock: mutex_unlock(&inode->log_mutex); @@ -5591,7 +5654,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), - log_mode, 0, LLONG_MAX, ctx); + log_mode, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; @@ -5735,7 +5798,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), - LOG_INODE_ALL, 0, LLONG_MAX, ctx); + LOG_INODE_ALL, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) ret = 1; @@ -5786,8 +5849,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > last_committed) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_INODE_EXISTS, - 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -5842,7 +5904,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation > fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); if (ret) break; } @@ -5950,8 +6012,6 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - const loff_t start, - const loff_t end, int inode_only, struct btrfs_log_ctx *ctx) { @@ -6004,7 +6064,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); + ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6100,15 +6160,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, - start, end, LOG_INODE_ALL, ctx); + LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -6371,26 +6429,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. - * - * @ctx can not be NULL when @sync_log is false, and should be NULL when it's - * true (because it's not used). - * - * Return value depends on whether @sync_log is true or false. - * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT - * otherwise. - * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, - * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed (without attempting to sync the log). */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx) + struct dentry *parent) { struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; + struct btrfs_log_ctx ctx; /* * this will force the logging code to walk the dentry chain @@ -6405,34 +6450,17 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : - BTRFS_DONT_NEED_LOG_SYNC; + return; - if (sync_log) { - struct btrfs_log_ctx ctx2; - - btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx2); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_TRANS_COMMIT; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - ret = btrfs_sync_log(trans, inode->root, &ctx2); - if (ret) - return BTRFS_NEED_TRANS_COMMIT; - return BTRFS_DONT_NEED_TRANS_COMMIT; - } - - ASSERT(ctx); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, ctx); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_LOG_SYNC; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - return BTRFS_NEED_LOG_SYNC; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fallback to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. + */ + btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 132e43d29034..731bd9c029f5 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -16,8 +16,11 @@ struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; + bool logging_new_name; struct inode *inode; struct list_head list; + /* Only used for fast fsyncs. */ + struct list_head ordered_extents; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -26,8 +29,23 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_ret = 0; ctx->log_transid = 0; ctx->log_new_dentries = false; + ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); +} + +static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } } static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) @@ -49,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -67,16 +83,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); -/* Return values for btrfs_log_new_name() */ -enum { - BTRFS_DONT_NEED_TRANS_COMMIT, - BTRFS_NEED_TRANS_COMMIT, - BTRFS_DONT_NEED_LOG_SYNC, - BTRFS_NEED_LOG_SYNC, -}; -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx); + struct dentry *parent); #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1997a7d67f22..58b9c419a2b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -291,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * balance_mutex * * - * Exclusive operations, BTRFS_FS_EXCL_OP - * ====================================== + * Exclusive operations + * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. @@ -318,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * The exclusive status is cleared when the device operation is canceled or * completed. */ @@ -356,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); + INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); @@ -406,7 +407,7 @@ void __exit btrfs_cleanup_fs_uuids(void) * Returned struct is not linked onto any lists and must be destroyed using * btrfs_free_device. */ -static struct btrfs_device *__alloc_device(void) +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *dev; @@ -433,7 +434,8 @@ static struct btrfs_device *__alloc_device(void) btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); return dev; } @@ -593,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path, btrfs_free_device(device); ret = 0; - if (fs_devices->num_devices == 0) - break; } mutex_unlock(&fs_devices->device_list_mutex); @@ -941,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, bdput(path_bdev); mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, - "duplicate device fsid:devid for %pU:%llu old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "duplicate device %s devid %llu generation %llu scanned by %s (%d)", + path, devid, found_transid, + current->comm, + task_pid_nr(current)); return ERR_PTR(-EEXIST); } bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, - "device fsid %pU devid %llu moved old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "devid %llu device path %s changed to %s scanned by %s (%d)", + devid, rcu_str_deref(device->name), + path, current->comm, + task_pid_nr(current)); } name = rcu_string_strdup(path, GFP_NOFS); @@ -1035,28 +1037,21 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) return ERR_PTR(ret); } -/* - * After we have read the system tree and know devids belonging to - * this filesystem, remove the device which does not belong there. - */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, + int step, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; - struct btrfs_device *latest_dev = NULL; - mutex_lock(&uuid_mutex); -again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state) && + &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && - (!latest_dev || - device->generation > latest_dev->generation)) { - latest_dev = device; + (!*latest_dev || + device->generation > (*latest_dev)->generation)) { + *latest_dev = device; } continue; } @@ -1094,10 +1089,22 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) btrfs_free_device(device); } - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } +} + +/* + * After we have read the system tree and know devids belonging to this + * filesystem, remove the device which does not belong there. + */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +{ + struct btrfs_device *latest_dev = NULL; + struct btrfs_fs_devices *seed_dev; + + mutex_lock(&uuid_mutex); + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); + + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; @@ -1149,47 +1156,41 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(atomic_read(&device->reada_in_flight) == 0); } -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; - if (--fs_devices->opened > 0) - return 0; + lockdep_assert_held(&uuid_mutex); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { + if (--fs_devices->opened > 0) + return; + + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); - } - mutex_unlock(&fs_devices->device_list_mutex); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; - - return 0; + fs_devices->fs_info = NULL; } -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_devices *seed_devices = NULL; - int ret; + LIST_HEAD(list); + struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); - ret = close_fs_devices(fs_devices); - if (!fs_devices->opened) { - seed_devices = fs_devices->seed; - fs_devices->seed = NULL; - } - mutex_unlock(&uuid_mutex); + close_fs_devices(fs_devices); + if (!fs_devices->opened) + list_splice_init(&fs_devices->seed_list, &list); - while (seed_devices) { - fs_devices = seed_devices; - seed_devices = fs_devices->seed; + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); + list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } - return ret; + mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, @@ -1197,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; + struct btrfs_device *tmp_device; flags |= FMODE_EXCL; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - /* Just open everything we can; ignore failures here */ - if (btrfs_open_one_device(fs_devices, device, flags, holder)) - continue; + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret; - if (!latest_dev || - device->generation > latest_dev->generation) + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret == 0 && + (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; + } else if (ret == -ENODATA) { + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + } } if (fs_devices->open_devices == 0) return -EINVAL; @@ -1961,16 +1968,13 @@ static struct btrfs_device * btrfs_find_next_active_device( * this_dev) which is active. */ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, - struct btrfs_device *this_dev) + struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_device *next_device; - if (this_dev) - next_device = this_dev; - else + if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, - device); + device); ASSERT(next_device); if (fs_info->sb->s_bdev && @@ -2040,7 +2044,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2144,7 +2148,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2165,14 +2169,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_free_device(device); if (cur_devices->open_devices == 0) { - while (fs_devices) { - if (fs_devices->seed == cur_devices) { - fs_devices->seed = cur_devices->seed; - break; - } - fs_devices = fs_devices->seed; - } - cur_devices->seed = NULL; + list_del_init(&cur_devices->seed_list); close_fs_devices(cur_devices); free_fs_devices(cur_devices); } @@ -2221,7 +2218,6 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { - struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; mutex_lock(&uuid_mutex); @@ -2232,8 +2228,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { - struct btrfs_fs_devices *tmp_fs_devices; - /* * On a mounted FS, num_devices can't be zero unless it's a * seed. In case of a seed device being replaced, the replace @@ -2242,15 +2236,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) */ ASSERT(fs_devices->seeding); - tmp_fs_devices = fs_info->fs_devices; - while (tmp_fs_devices) { - if (tmp_fs_devices->seed == fs_devices) { - tmp_fs_devices->seed = fs_devices->seed; - break; - } - tmp_fs_devices = tmp_fs_devices->seed; - } - fs_devices->seed = NULL; + list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } @@ -2263,7 +2249,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); + btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2372,10 +2358,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; + /* + * Private copy of the seed devices, anchored at + * fs_info->fs_devices->seed_list + */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); + /* + * It's necessary to retain a copy of the original seed fs_devices in + * fs_uuids so that filesystems which have been seeded can successfully + * reference the seed device from open_seed_devices. This also supports + * multiple fs seed. + */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); @@ -2396,16 +2392,12 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; - mutex_lock(&fs_info->chunk_mutex); - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; fs_devices->rotating = false; - fs_devices->seed = seed_devices; + list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -2508,7 +2500,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 orig_super_num_devices; int seeding_dev = 0; int ret = 0; - bool unlocked = false; + bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; @@ -2522,20 +2514,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); + locked = true; } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; - mutex_unlock( - &fs_devices->device_list_mutex); + rcu_read_unlock(); goto error; } } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { @@ -2610,9 +2602,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices + 1); - /* add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); - /* * we've got more storage, clear any full flags on the space * infos @@ -2620,6 +2609,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); + + /* Add sysfs device entry */ + btrfs_sysfs_add_device(device); + mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { @@ -2645,8 +2638,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_sysfs; } - btrfs_sysfs_update_sprout_fsid(fs_devices, - fs_info->fs_devices->fsid); + /* + * fs_devices now represents the newly sprouted filesystem and + * its fsid has been changed by btrfs_prepare_sprout + */ + btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); @@ -2654,7 +2650,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); - unlocked = true; + locked = false; if (ret) /* transaction commit */ return ret; @@ -2689,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); @@ -2715,7 +2711,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev && !unlocked) { + if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } @@ -4042,7 +4038,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, /* * rw_devices will not change at the moment, device add/delete/replace - * are excluded by EXCL_OP + * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; @@ -4178,7 +4174,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); @@ -4189,7 +4185,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, reset_balance_state(fs_info); else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -4291,7 +4287,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4373,7 +4369,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) if (fs_info->balance_ctl) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } @@ -6458,11 +6454,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, bool seed) { struct btrfs_device *device; + struct btrfs_fs_devices *seed_devs; - while (fs_devices) { + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } + } + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!fsid || - !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &fs_devices->devices, + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid, @@ -6470,11 +6476,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, return device; } } - if (seed) - fs_devices = fs_devices->seed; - else - return NULL; } + return NULL; } @@ -6529,7 +6532,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(); + dev = __alloc_device(fs_info); if (IS_ERR(dev)) return dev; @@ -6725,13 +6728,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, lockdep_assert_held(&uuid_mutex); ASSERT(fsid); - fs_devices = fs_info->fs_devices->seed; - while (fs_devices) { + /* This will match only for multi-device seed fs */ + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; - fs_devices = fs_devices->seed; - } fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { @@ -6747,6 +6748,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, return fs_devices; } + /* + * Upon first call for a seed fs fsid, just create a private copy of the + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list + */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; @@ -6754,20 +6759,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); - fs_devices = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); - fs_devices = ERR_PTR(-EINVAL); - goto out; + return ERR_PTR(-EINVAL); } - fs_devices->seed = fs_info->fs_devices->seed; - fs_info->fs_devices->seed = fs_devices; -out: + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); + return fs_devices; } @@ -7186,17 +7188,22 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - while (fs_devices) { - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - device->fs_info = fs_info; - mutex_unlock(&fs_devices->device_list_mutex); + fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->fs_info = fs_info; + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) + device->fs_info = fs_info; + + seed_devs->fs_info = fs_info; } + mutex_unlock(&fs_devices->device_list_mutex); } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, @@ -7222,17 +7229,53 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } +static int btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) +{ + struct btrfs_dev_stats_item *ptr; + struct extent_buffer *eb; + struct btrfs_key key; + int item_size; + int i, ret, slot; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); + if (ret) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); + device->dev_stats_valid = 1; + btrfs_release_path(path); + return ret < 0 ? ret : 0; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_set(device, i, 0); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); + + return 0; +} + int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { - struct btrfs_key key; - struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct extent_buffer *eb; - int slot; - int ret = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; - int i; + int ret = 0; path = btrfs_alloc_path(); if (!path) @@ -7240,43 +7283,22 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - int item_size; - struct btrfs_dev_stats_item *ptr; - - key.objectid = BTRFS_DEV_STATS_OBJECTID; - key.type = BTRFS_PERSISTENT_ITEM_KEY; - key.offset = device->devid; - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); - if (ret) { - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_set(device, i, 0); - device->dev_stats_valid = 1; - btrfs_release_path(path); - continue; - } - slot = path->slots[0]; - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); - - ptr = btrfs_item_ptr(eb, slot, - struct btrfs_dev_stats_item); - - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { - if (item_size >= (1 + i) * sizeof(__le64)) - btrfs_dev_stat_set(device, i, - btrfs_dev_stats_value(eb, ptr, i)); - else - btrfs_dev_stat_set(device, i, 0); - } - - device->dev_stats_valid = 1; - btrfs_dev_stat_print_on_load(device); - btrfs_release_path(path); + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + } +out: mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); - return ret < 0 ? ret : 0; + return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, @@ -7493,24 +7515,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) mutex_unlock(&trans->fs_info->chunk_mutex); } -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; - } -} - -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = NULL; - fs_devices = fs_devices->seed; - } -} - /* * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. */ @@ -7591,8 +7595,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, - NULL, false); + struct btrfs_fs_devices *devs; + + devs = list_first_entry(&fs_info->fs_devices->seed_list, + struct btrfs_fs_devices, seed_list); + dev = btrfs_find_device(devs, devid, NULL, NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 302c9234f7d0..bf27ac07d315 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string *name; + struct rcu_string __rcu *name; u64 generation; @@ -246,7 +246,7 @@ struct btrfs_fs_devices { */ struct list_head alloc_list; - struct btrfs_fs_devices *seed; + struct list_head seed_list; bool seeding; int opened; @@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); int btrfs_forget_devices(const char *path); -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); @@ -569,8 +569,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, diff --git a/fs/direct-io.c b/fs/direct-io.c index 780f09f1e2c4..0353b322a00c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -387,25 +387,6 @@ static void dio_bio_end_io(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. - */ -void dio_end_io(struct bio *bio) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index f82a4952769d..ee92634196a8 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,6 +4,7 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP + select SRCU help A general purpose distributed lock manager for kernel or userspace applications. diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 47f0b98b707f..49c5f9407098 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, int check_zero, + int *info_field, bool (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, if (rc) return rc; - if (check_zero && !x) + if (check_cb && check_cb(x)) return -EINVAL; *cl_field = x; @@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, return len; } -#define CLUSTER_ATTR(name, check_zero) \ +#define CLUSTER_ATTR(name, check_cb) \ static ssize_t cluster_##name##_store(struct config_item *item, \ const char *buf, size_t len) \ { \ struct dlm_cluster *cl = config_item_to_cluster(item); \ return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ - check_zero, buf, len); \ + check_cb, buf, len); \ } \ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ @@ -161,20 +161,30 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ } \ CONFIGFS_ATTR(cluster_, name); -CLUSTER_ATTR(tcp_port, 1); -CLUSTER_ATTR(buffer_size, 1); -CLUSTER_ATTR(rsbtbl_size, 1); -CLUSTER_ATTR(recover_timer, 1); -CLUSTER_ATTR(toss_secs, 1); -CLUSTER_ATTR(scan_secs, 1); -CLUSTER_ATTR(log_debug, 0); -CLUSTER_ATTR(log_info, 0); -CLUSTER_ATTR(protocol, 0); -CLUSTER_ATTR(mark, 0); -CLUSTER_ATTR(timewarn_cs, 1); -CLUSTER_ATTR(waitwarn_us, 0); -CLUSTER_ATTR(new_rsb_count, 0); -CLUSTER_ATTR(recover_callbacks, 0); +static bool dlm_check_zero(unsigned int x) +{ + return !x; +} + +static bool dlm_check_buffer_size(unsigned int x) +{ + return (x < DEFAULT_BUFFER_SIZE); +} + +CLUSTER_ATTR(tcp_port, dlm_check_zero); +CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); +CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); +CLUSTER_ATTR(recover_timer, dlm_check_zero); +CLUSTER_ATTR(toss_secs, dlm_check_zero); +CLUSTER_ATTR(scan_secs, dlm_check_zero); +CLUSTER_ATTR(log_debug, NULL); +CLUSTER_ATTR(log_info, NULL); +CLUSTER_ATTR(protocol, NULL); +CLUSTER_ATTR(mark, NULL); +CLUSTER_ATTR(timewarn_cs, dlm_check_zero); +CLUSTER_ATTR(waitwarn_us, NULL); +CLUSTER_ATTR(new_rsb_count, NULL); +CLUSTER_ATTR(recover_callbacks, NULL); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, @@ -221,6 +231,7 @@ struct dlm_space { struct list_head members; struct mutex members_lock; int members_count; + struct dlm_nodes *nds; }; struct dlm_comms { @@ -430,6 +441,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; + sp->nds = nds; return &sp->group; fail: @@ -451,6 +463,7 @@ static void drop_space(struct config_group *g, struct config_item *i) static void release_space(struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); + kfree(sp->nds); kfree(sp); } @@ -857,18 +870,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -int dlm_comm_mark(int nodeid, unsigned int *mark) +void dlm_comm_mark(int nodeid, unsigned int *mark) { struct dlm_comm *cm; cm = get_comm(nodeid); - if (!cm) - return -ENOENT; + if (!cm) { + *mark = dlm_config.ci_mark; + return; + } + + if (cm->mark) + *mark = cm->mark; + else + *mark = dlm_config.ci_mark; - *mark = cm->mark; put_comm(cm); - - return 0; } int dlm_our_nodeid(void) @@ -889,7 +906,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) /* Config file defaults */ #define DEFAULT_TCP_PORT 21064 -#define DEFAULT_BUFFER_SIZE 4096 #define DEFAULT_RSBTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 diff --git a/fs/dlm/config.h b/fs/dlm/config.h index f62996cad561..c210250a2581 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -12,6 +12,8 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ +#define DEFAULT_BUFFER_SIZE 4096 + struct dlm_config_node { int nodeid; int weight; @@ -46,7 +48,7 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -int dlm_comm_mark(int nodeid, unsigned int *mark); +void dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5050fe05769b..79f56f16bc2c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -65,40 +65,6 @@ #define MAX_SEND_MSG_COUNT 25 #define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) -struct cbuf { - unsigned int base; - unsigned int len; - unsigned int mask; -}; - -static void cbuf_add(struct cbuf *cb, int n) -{ - cb->len += n; -} - -static int cbuf_data(struct cbuf *cb) -{ - return ((cb->base + cb->len) & cb->mask); -} - -static void cbuf_init(struct cbuf *cb, int size) -{ - cb->base = cb->len = 0; - cb->mask = size-1; -} - -static void cbuf_eat(struct cbuf *cb, int n) -{ - cb->len -= n; - cb->base += n; - cb->base &= cb->mask; -} - -static bool cbuf_empty(struct cbuf *cb) -{ - return cb->len == 0; -} - struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ @@ -117,8 +83,6 @@ struct connection { int (*rx_action) (struct connection *); /* What to do when active */ void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ - struct page *rx_page; - struct cbuf cb; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -126,6 +90,10 @@ struct connection { struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ + unsigned char *rx_buf; + int rx_buflen; + int rx_leftover; + struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -167,8 +135,8 @@ static struct workqueue_struct *recv_workqueue; static struct workqueue_struct *send_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; -static DEFINE_MUTEX(connections_lock); -static struct kmem_cache *con_cache; +static DEFINE_SPINLOCK(connections_lock); +DEFINE_STATIC_SRCU(connections_srcu); static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); @@ -184,15 +152,20 @@ static inline int nodeid_hash(int nodeid) static struct connection *__find_con(int nodeid) { - int r; + int r, idx; struct connection *con; r = nodeid_hash(nodeid); - hlist_for_each_entry(con, &connection_hash[r], list) { - if (con->nodeid == nodeid) + idx = srcu_read_lock(&connections_srcu); + hlist_for_each_entry_rcu(con, &connection_hash[r], list) { + if (con->nodeid == nodeid) { + srcu_read_unlock(&connections_srcu, idx); return con; + } } + srcu_read_unlock(&connections_srcu, idx); + return NULL; } @@ -200,21 +173,25 @@ static struct connection *__find_con(int nodeid) * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node. */ -static struct connection *__nodeid2con(int nodeid, gfp_t alloc) +static struct connection *nodeid2con(int nodeid, gfp_t alloc) { - struct connection *con = NULL; + struct connection *con, *tmp; int r; con = __find_con(nodeid); if (con || !alloc) return con; - con = kmem_cache_zalloc(con_cache, alloc); + con = kzalloc(sizeof(*con), alloc); if (!con) return NULL; - r = nodeid_hash(nodeid); - hlist_add_head(&con->list, &connection_hash[r]); + con->rx_buflen = dlm_config.ci_buffer_size; + con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); + if (!con->rx_buf) { + kfree(con); + return NULL; + } con->nodeid = nodeid; mutex_init(&con->sock_mutex); @@ -233,31 +210,41 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) con->rx_action = zerocon->rx_action; } + r = nodeid_hash(nodeid); + + spin_lock(&connections_lock); + /* Because multiple workqueues/threads calls this function it can + * race on multiple cpu's. Instead of locking hot path __find_con() + * we just check in rare cases of recently added nodes again + * under protection of connections_lock. If this is the case we + * abort our connection creation and return the existing connection. + */ + tmp = __find_con(nodeid); + if (tmp) { + spin_unlock(&connections_lock); + kfree(con->rx_buf); + kfree(con); + return tmp; + } + + hlist_add_head_rcu(&con->list, &connection_hash[r]); + spin_unlock(&connections_lock); + return con; } /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { - int i; - struct hlist_node *n; + int i, idx; struct connection *con; + idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_safe(con, n, &connection_hash[i], list) + hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } -} - -static struct connection *nodeid2con(int nodeid, gfp_t allocation) -{ - struct connection *con; - - mutex_lock(&connections_lock); - con = __nodeid2con(nodeid, allocation); - mutex_unlock(&connections_lock); - - return con; + srcu_read_unlock(&connections_srcu, idx); } static struct dlm_node_addr *find_node_addr(int nodeid) @@ -614,11 +601,8 @@ static void close_connection(struct connection *con, bool and_other, /* Will only re-enter once. */ close_connection(con->othercon, false, true, true); } - if (con->rx_page) { - __free_page(con->rx_page); - con->rx_page = NULL; - } + con->rx_leftover = 0; con->retries = 0; mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); @@ -672,16 +656,33 @@ static void dlm_tcp_shutdown(struct connection *con) shutdown_connection(con); } +static int con_realloc_receive_buf(struct connection *con, int newlen) +{ + unsigned char *newbuf; + + newbuf = kmalloc(newlen, GFP_NOFS); + if (!newbuf) + return -ENOMEM; + + /* copy any leftover from last receive */ + if (con->rx_leftover) + memmove(newbuf, con->rx_buf, con->rx_leftover); + + /* swap to new buffer space */ + kfree(con->rx_buf); + con->rx_buflen = newlen; + con->rx_buf = newbuf; + + return 0; +} + /* Data received from remote end */ static int receive_from_sock(struct connection *con) { - int ret = 0; - struct msghdr msg = {}; - struct kvec iov[2]; - unsigned len; - int r; int call_again_soon = 0; - int nvec; + struct msghdr msg; + struct kvec iov; + int ret, buflen; mutex_lock(&con->sock_mutex); @@ -689,71 +690,55 @@ static int receive_from_sock(struct connection *con) ret = -EAGAIN; goto out_close; } + if (con->nodeid == 0) { ret = -EINVAL; goto out_close; } - if (con->rx_page == NULL) { - /* - * This doesn't need to be atomic, but I think it should - * improve performance if it is. - */ - con->rx_page = alloc_page(GFP_ATOMIC); - if (con->rx_page == NULL) + /* realloc if we get new buffer size to read out */ + buflen = dlm_config.ci_buffer_size; + if (con->rx_buflen != buflen && con->rx_leftover <= buflen) { + ret = con_realloc_receive_buf(con, buflen); + if (ret < 0) goto out_resched; - cbuf_init(&con->cb, PAGE_SIZE); } - /* - * iov[0] is the bit of the circular buffer between the current end - * point (cb.base + cb.len) and the end of the buffer. + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes */ - iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); - iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); - iov[1].iov_len = 0; - nvec = 1; + iov.iov_base = con->rx_buf + con->rx_leftover; + iov.iov_len = con->rx_buflen - con->rx_leftover; - /* - * iov[1] is the bit of the circular buffer between the start of the - * buffer and the start of the currently used section (cb.base) - */ - if (cbuf_data(&con->cb) >= con->cb.base) { - iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb); - iov[1].iov_len = con->cb.base; - iov[1].iov_base = page_address(con->rx_page); - nvec = 2; - } - len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len); - - r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); if (ret <= 0) goto out_close; - else if (ret == len) + else if (ret == iov.iov_len) call_again_soon = 1; - cbuf_add(&con->cb, ret); - ret = dlm_process_incoming_buffer(con->nodeid, - page_address(con->rx_page), - con->cb.base, con->cb.len, - PAGE_SIZE); - if (ret < 0) { - log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d", - ret, page_address(con->rx_page), con->cb.base, - con->cb.len, r); - cbuf_eat(&con->cb, r); - } else { - cbuf_eat(&con->cb, ret); - } + /* new buflen according readed bytes and leftover from last receive */ + buflen = ret + con->rx_leftover; + ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); + if (ret < 0) + goto out_close; - if (cbuf_empty(&con->cb) && !call_again_soon) { - __free_page(con->rx_page); - con->rx_page = NULL; + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. + */ + con->rx_leftover = buflen - ret; + if (con->rx_leftover) { + memmove(con->rx_buf, con->rx_buf + ret, + con->rx_leftover); + call_again_soon = true; } if (call_again_soon) goto out_resched; + mutex_unlock(&con->sock_mutex); return 0; @@ -791,13 +776,11 @@ static int accept_from_sock(struct connection *con) int nodeid; struct connection *newcon; struct connection *addcon; + unsigned int mark; - mutex_lock(&connections_lock); if (!dlm_allow_conn) { - mutex_unlock(&connections_lock); return -1; } - mutex_unlock(&connections_lock); mutex_lock_nested(&con->sock_mutex, 0); @@ -830,6 +813,9 @@ static int accept_from_sock(struct connection *con) return -1; } + dlm_comm_mark(nodeid, &mark); + sock_set_mark(newsock->sk, mark); + log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. This @@ -847,13 +833,24 @@ static int accept_from_sock(struct connection *con) struct connection *othercon = newcon->othercon; if (!othercon) { - othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); result = -ENOMEM; goto accept_err; } + + othercon->rx_buflen = dlm_config.ci_buffer_size; + othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS); + if (!othercon->rx_buf) { + mutex_unlock(&newcon->sock_mutex); + kfree(othercon); + log_print("failed to allocate incoming socket receive buffer"); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); @@ -975,6 +972,8 @@ static void sctp_connect_to_sock(struct connection *con) return; } + dlm_comm_mark(con->nodeid, &mark); + mutex_lock(&con->sock_mutex); /* Some odd races can cause double-connects, ignore them */ @@ -999,11 +998,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto bind_err; - sock_set_mark(sock->sk, mark); con->rx_action = receive_from_sock; @@ -1076,6 +1070,8 @@ static void tcp_connect_to_sock(struct connection *con) return; } + dlm_comm_mark(con->nodeid, &mark); + mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -1090,11 +1086,6 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto out_err; - sock_set_mark(sock->sk, mark); memset(&saddr, 0, sizeof(saddr)); @@ -1238,6 +1229,14 @@ static void init_local(void) } } +static void deinit_local(void) +{ + int i; + + for (i = 0; i < dlm_local_count; i++) + kfree(dlm_local_addr[i]); +} + /* Initialise SCTP socket and bind to all interfaces */ static int sctp_listen_for_all(void) { @@ -1546,13 +1545,6 @@ static void process_send_sockets(struct work_struct *work) send_to_sock(con); } - -/* Discard all entries on the write queues */ -static void clean_writequeues(void) -{ - foreach_conn(clean_one_writequeue); -} - static void work_stop(void) { if (recv_workqueue) @@ -1608,26 +1600,34 @@ static void shutdown_conn(struct connection *con) con->shutdown_action(con); } +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); + + kfree(con->rx_buf); + kfree(con); +} + static void free_conn(struct connection *con) { close_connection(con, true, true, true); - if (con->othercon) - kmem_cache_free(con_cache, con->othercon); - hlist_del(&con->list); - kmem_cache_free(con_cache, con); + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + if (con->othercon) { + clean_one_writequeue(con->othercon); + call_rcu(&con->othercon->rcu, connection_release); + } + clean_one_writequeue(con); + call_rcu(&con->rcu, connection_release); } static void work_flush(void) { - int ok; + int ok, idx; int i; - struct hlist_node *n; struct connection *con; - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); do { ok = 1; foreach_conn(stop_conn); @@ -1635,9 +1635,10 @@ static void work_flush(void) flush_workqueue(recv_workqueue); if (send_workqueue) flush_workqueue(send_workqueue); + idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_safe(con, n, - &connection_hash[i], list) { + hlist_for_each_entry_rcu(con, &connection_hash[i], + list) { ok &= test_bit(CF_READ_PENDING, &con->flags); ok &= test_bit(CF_WRITE_PENDING, &con->flags); if (con->othercon) { @@ -1648,6 +1649,7 @@ static void work_flush(void) } } } + srcu_read_unlock(&connections_srcu, idx); } while (!ok); } @@ -1656,16 +1658,18 @@ void dlm_lowcomms_stop(void) /* Set all the flags to prevent any socket activity. */ - mutex_lock(&connections_lock); dlm_allow_conn = 0; - mutex_unlock(&connections_lock); + + if (recv_workqueue) + flush_workqueue(recv_workqueue); + if (send_workqueue) + flush_workqueue(send_workqueue); + foreach_conn(shutdown_conn); work_flush(); - clean_writequeues(); foreach_conn(free_conn); work_stop(); - - kmem_cache_destroy(con_cache); + deinit_local(); } int dlm_lowcomms_start(void) @@ -1684,16 +1688,9 @@ int dlm_lowcomms_start(void) goto fail; } - error = -ENOMEM; - con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), - __alignof__(struct connection), 0, - NULL); - if (!con_cache) - goto fail; - error = work_start(); if (error) - goto fail_destroy; + goto fail; dlm_allow_conn = 1; @@ -1710,12 +1707,8 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; con = nodeid2con(0,0); - if (con) { - close_connection(con, false, true, true); - kmem_cache_free(con_cache, con); - } -fail_destroy: - kmem_cache_destroy(con_cache); + if (con) + free_conn(con); fail: return error; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 921322d133e3..fde3a6afe4be 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -22,114 +22,84 @@ * into packets and sends them to the comms layer. */ +#include + #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "lock.h" #include "midcomms.h" - -static void copy_from_cb(void *dst, const void *base, unsigned offset, - unsigned len, unsigned limit) -{ - unsigned copy = len; - - if ((copy + offset) > limit) - copy = limit - offset; - memcpy(dst, base + offset, copy); - len -= copy; - if (len) - memcpy(dst + copy, base, len); -} - /* * Called from the low-level comms layer to process a buffer of * commands. - * - * Only complete messages are processed here, any "spare" bytes from - * the end of a buffer are saved and tacked onto the front of the next - * message that comes in. I doubt this will happen very often but we - * need to be able to cope with it and I don't want the task to be waiting - * for packets to come in when there is useful work to be done. */ -int dlm_process_incoming_buffer(int nodeid, const void *base, - unsigned offset, unsigned len, unsigned limit) +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) { - union { - unsigned char __buf[DLM_INBUF_LEN]; - /* this is to force proper alignment on some arches */ - union dlm_packet p; - } __tmp; - union dlm_packet *p = &__tmp.p; - int ret = 0; - int err = 0; + const unsigned char *ptr = buf; + const struct dlm_header *hd; uint16_t msglen; - uint32_t lockspace; + int ret = 0; - while (len > sizeof(struct dlm_header)) { + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; - /* Copy just the header to check the total length. The - message may wrap around the end of the buffer back to the - start, so we need to use a temp buffer and copy_from_cb. */ - - copy_from_cb(p, base, offset, sizeof(struct dlm_header), - limit); - - msglen = le16_to_cpu(p->header.h_length); - lockspace = p->header.h_lockspace; - - err = -EINVAL; - if (msglen < sizeof(struct dlm_header)) - break; - if (p->header.h_cmd == DLM_MSG) { - if (msglen < sizeof(struct dlm_message)) - break; - } else { - if (msglen < sizeof(struct dlm_rcom)) - break; + /* no message should be more than this otherwise we + * cannot deliver this message to upper layers + */ + msglen = get_unaligned_le16(&hd->h_length); + if (msglen > DEFAULT_BUFFER_SIZE) { + log_print("received invalid length header: %u, will abort message parsing", + msglen); + return -EBADMSG; } - err = -E2BIG; - if (msglen > dlm_config.ci_buffer_size) { - log_print("message size %d from %d too big, buf len %d", - msglen, nodeid, len); - break; - } - err = 0; - - /* If only part of the full message is contained in this - buffer, then do nothing and wait for lowcomms to call - us again later with more data. We return 0 meaning - we've consumed none of the input buffer. */ + /* caller will take care that leftover + * will be parsed next call with more data + */ if (msglen > len) break; - /* Allocate a larger temp buffer if the full message won't fit - in the buffer on the stack (which should work for most - ordinary messages). */ + switch (hd->h_cmd) { + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("dlm msg too small: %u, will skip this message", + msglen); + goto skip; + } - if (msglen > sizeof(__tmp) && p == &__tmp.p) { - p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); - if (p == NULL) - return ret; + break; + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("dlm rcom msg too small: %u, will skip this message", + msglen); + goto skip; + } + + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message", + hd->h_cmd); + goto skip; } - copy_from_cb(p, base, offset, msglen, limit); - - BUG_ON(lockspace != p->header.h_lockspace); + /* for aligned memory access, we just copy current message + * to begin of the buffer which contains already parsed buffer + * data and should provide align access for upper layers + * because the start address of the buffer has a aligned + * address. This memmove can be removed when the upperlayer + * is capable of unaligned memory access. + */ + memmove(buf, ptr, msglen); + dlm_receive_buffer((union dlm_packet *)buf, nodeid); +skip: ret += msglen; - offset += msglen; - offset &= (limit - 1); len -= msglen; - - dlm_receive_buffer(p, nodeid); + ptr += msglen; } - if (p != &__tmp.p) - kfree(p); - - return err ? err : ret; + return ret; } diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 2e122e81c8d0..61e90a921849 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -12,8 +12,7 @@ #ifndef __MIDCOMMS_DOT_H__ #define __MIDCOMMS_DOT_H__ -int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, - unsigned len, unsigned limit); +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); #endif /* __MIDCOMMS_DOT_H__ */ diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 459ecb42cbd3..347be146884c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -224,7 +224,7 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, bio_set_dev(bio, sb->s_bdev); bio->bi_iter.bi_sector = (sector_t)blknr << LOG_SECTORS_PER_BLOCK; - bio->bi_opf = REQ_OP_READ; + bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0); } err = bio_add_page(bio, page, PAGE_SIZE, 0); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ddaa516c008a..b9a09806512a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) enum { Opt_user_xattr, - Opt_nouser_xattr, Opt_acl, - Opt_noacl, Opt_cache_strategy, Opt_err }; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 0416f3437b95..58cf0cf1b818 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -474,8 +474,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; break; case EROFS_XATTR_INDEX_SECURITY: break; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6c939def00f9..50912a5420b4 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend { struct z_erofs_collector clt; struct erofs_map_blocks map; + bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -153,8 +154,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, - enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + enum z_erofs_cache_alloctype type) { const struct z_erofs_pcluster *pcl = clt->pcl; const unsigned int clusterpages = BIT(pcl->clusterbits); @@ -562,8 +562,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, - struct list_head *pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -620,8 +619,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, else cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy); hitted: /* @@ -653,7 +651,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = - erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL); + alloc_page(GFP_NOFS | __GFP_NOFAIL); newpage->mapping = Z_EROFS_MAPPING_STAGING; err = z_erofs_attach_page(clt, newpage, @@ -1151,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, } static void z_erofs_submit_queue(struct super_block *sb, - z_erofs_next_pcluster_t owned_head, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) @@ -1160,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb, z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; + z_erofs_next_pcluster_t owned_head = f->clt.owned_head; /* since bio will be NULL, no need to initialize last_index */ pgoff_t last_index; unsigned int nr_bios = 0; @@ -1193,7 +1192,6 @@ static void z_erofs_submit_queue(struct super_block *sb, do { struct page *page; - int err; page = pickup_page_for_submission(pcl, i++, pagepool, MNGD_MAPPING(sbi), @@ -1216,11 +1214,12 @@ static void z_erofs_submit_queue(struct super_block *sb, LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; bio->bi_opf = REQ_OP_READ; + if (f->readahead) + bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) goto submit_bio_retry; last_index = cur; @@ -1248,14 +1247,14 @@ static void z_erofs_submit_queue(struct super_block *sb, } static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_collector *clt, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg); + z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1282,11 +1281,11 @@ static int z_erofs_readpage(struct file *file, struct page *page) f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, true); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1299,25 +1298,20 @@ static int z_erofs_readpage(struct file *file, struct page *page) return err; } -static bool should_decompress_synchronously(struct erofs_sb_info *sbi, - unsigned int nr) -{ - return nr <= sbi->ctx.max_sync_decompress_pages; -} - static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); + unsigned int nr_pages = readahead_count(rac); + bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(inode, readahead_index(rac), - readahead_count(rac), false); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + f.readahead = true; f.headoffset = readahead_pos(rac); while ((page = readahead_page(rac))) { @@ -1341,7 +1335,7 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", @@ -1351,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); if (f.map.mpage) put_page(f.map.mpage); diff --git a/include/linux/fs.h b/include/linux/fs.h index 7010ff2aa11e..612592a1eec7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3089,8 +3089,6 @@ enum { DIO_SKIP_HOLES = 0x02, }; -void dio_end_io(struct bio *bio); - ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 7d46411ffaa2..42df06c6b19c 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -297,6 +297,8 @@ struct mmc_card { struct sdio_cis cis; /* common tuple info */ struct sdio_func *sdio_func[SDIO_MAX_FUNCS]; /* SDIO functions (devices) */ struct sdio_func *sdio_single_irq; /* SDIO function when only one IRQ active */ + u8 major_rev; /* major revision number */ + u8 minor_rev; /* minor revision number */ unsigned num_info; /* number of info strings */ const char **info; /* info strings */ struct sdio_func_tuple *tuples; /* unknown common tuples */ diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index c5b6e97cb21a..c079b932330f 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -163,6 +163,7 @@ struct mmc_host_ops { int (*select_drive_strength)(struct mmc_card *card, unsigned int max_dtr, int host_drv, int card_drv, int *drv_type); + /* Reset the eMMC card via RST_n */ void (*hw_reset)(struct mmc_host *host); void (*card_event)(struct mmc_host *host); @@ -346,7 +347,7 @@ struct mmc_host { #define MMC_CAP_CD_WAKE (1 << 28) /* Enable card detect wake */ #define MMC_CAP_CMD_DURING_TFR (1 << 29) /* Commands during data transfer */ #define MMC_CAP_CMD23 (1 << 30) /* CMD23 supported. */ -#define MMC_CAP_HW_RESET (1 << 31) /* Hardware reset */ +#define MMC_CAP_HW_RESET (1 << 31) /* Reset the eMMC card via RST_n */ u32 caps2; /* More host capabilities */ @@ -399,6 +400,7 @@ struct mmc_host { unsigned int use_spi_crc:1; unsigned int claimed:1; /* host exclusively claimed */ unsigned int bus_dead:1; /* bus has been released */ + unsigned int doing_init_tune:1; /* initial tuning in progress */ unsigned int can_retune:1; /* re-tuning can be used */ unsigned int doing_retune:1; /* re-tuning in progress */ unsigned int retune_now:1; /* do re-tuning at next req */ @@ -594,6 +596,11 @@ static inline bool mmc_doing_retune(struct mmc_host *host) return host->doing_retune == 1; } +static inline bool mmc_doing_tune(struct mmc_host *host) +{ + return host->doing_retune == 1 || host->doing_init_tune == 1; +} + static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data) { return data->flags & MMC_DATA_WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE; diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index fa2aaab5e57a..478855b8e406 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -51,6 +51,8 @@ struct sdio_func { u8 *tmpbuf; /* DMA:able scratch buffer */ + u8 major_rev; /* major revision number */ + u8 minor_rev; /* minor revision number */ unsigned num_info; /* number of info strings */ const char **info; /* info strings */ diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 863335ecb7e8..ecd24c719de4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -79,6 +79,7 @@ struct btrfs_space_info; #define IO_TREE_OWNER \ EM( IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS") \ EM( IO_TREE_FS_EXCLUDED_EXTENTS, "EXCLUDED_EXTENTS") \ + EM( IO_TREE_BTREE_INODE_IO, "BTREE_INODE_IO") \ EM( IO_TREE_INODE_IO, "INODE_IO") \ EM( IO_TREE_INODE_IO_FAILURE, "INODE_IO_FAILURE") \ EM( IO_TREE_RELOC_BLOCKS, "RELOC_BLOCKS") \ @@ -510,7 +511,7 @@ DEFINE_EVENT( DECLARE_EVENT_CLASS(btrfs__ordered_extent, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered), @@ -529,8 +530,8 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __field( u64, truncated_len ) ), - TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), - __entry->ino = btrfs_ino(BTRFS_I(inode)); + TP_fast_assign_btrfs(inode->root->fs_info, + __entry->ino = btrfs_ino(inode); __entry->file_offset = ordered->file_offset; __entry->start = ordered->disk_bytenr; __entry->len = ordered->num_bytes; @@ -539,8 +540,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __entry->flags = ordered->flags; __entry->compress_type = ordered->compress_type; __entry->refs = refcount_read(&ordered->refs); - __entry->root_objectid = - BTRFS_I(inode)->root->root_key.objectid; + __entry->root_objectid = inode->root->root_key.objectid; __entry->truncated_len = ordered->truncated_len; ), @@ -563,7 +563,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -571,7 +571,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -579,7 +579,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -587,7 +587,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -1176,25 +1176,27 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_fs_info *fs_info, u64 num_bytes, + TP_PROTO(const struct btrfs_root *root, u64 num_bytes, u64 empty_size, u64 data), - TP_ARGS(fs_info, num_bytes, empty_size, data), + TP_ARGS(root, num_bytes, empty_size, data), TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) __field( u64, data ) ), - TP_fast_assign_btrfs(fs_info, + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; __entry->num_bytes = num_bytes; __entry->empty_size = empty_size; __entry->data = data; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", - show_root_type(BTRFS_EXTENT_TREE_OBJECTID), + show_root_type(__entry->root_objectid), __entry->num_bytes, __entry->empty_size, __entry->data, __print_flags((unsigned long)__entry->data, "|", BTRFS_GROUP_FLAGS)) diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 9ba64ca6b4ac..6b885982ece6 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -4,6 +4,11 @@ #include #include +#ifdef __KERNEL__ +#include +#else +#include +#endif /* * This header contains the structure definitions and constants used @@ -644,6 +649,15 @@ struct btrfs_root_item { __le64 reserved[8]; /* for future */ } __attribute__ ((__packed__)); +/* + * Btrfs root item used to be smaller than current size. The old format ends + * at where member generation_v2 is. + */ +static inline __u32 btrfs_legacy_root_item_size(void) +{ + return offsetof(struct btrfs_root_item, generation_v2); +} + /* * this is used for both forward and backward root refs */